diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -988,18 +988,24 @@ // Buffer intrinsics ////////////////////////////////////////////////////////////////////////// +// Data type for buffer resources (V#). Maybe, in the future, we can create a +// similar one for textures (T#). +class AMDGPUBufferRsrcTy + : LLVMQualPointerType; + let TargetPrefix = "amdgcn" in { defset list AMDGPUBufferIntrinsics = { class AMDGPUBufferLoad : DefaultAttrsIntrinsic < [data_ty], - [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrReadMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, + [AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, + ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; @@ -1008,21 +1014,23 @@ // the offset argument is uniform. def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < [llvm_any_ty], - [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // byte offset - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 2 = dlc) - [IntrNoMem, ImmArg>]>, - AMDGPURsrcIntrinsic<0>; + [AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // byte offset + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 2 = dlc) + [IntrArgMemOnly, IntrReadMem, + ReadOnly>, NoCapture>, ImmArg>]>, + AMDGPURsrcIntrinsic<0>; class AMDGPUBufferStore : DefaultAttrsIntrinsic < [], - [data_ty, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrWriteMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, + [data_ty, // vdata(VGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, + ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; @@ -1036,60 +1044,64 @@ // they behave differently in bounds checking and swizzling. class AMDGPURawBufferLoad : DefaultAttrsIntrinsic < [data_ty], - [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) - llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, - // bit 1 = slc, - // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) - [IntrReadMem, ImmArg>], "", [SDNPMemOperand]>, + [AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) + [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, + ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad; class AMDGPUStructBufferLoad : DefaultAttrsIntrinsic < [data_ty], - [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) - llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, - // bit 1 = slc, - // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) - [IntrReadMem, ImmArg>], "", [SDNPMemOperand]>, + [AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) + [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, + ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad; class AMDGPURawBufferStore : DefaultAttrsIntrinsic < [], - [data_ty, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) - llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, - // bit 1 = slc, - // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) - [IntrWriteMem, ImmArg>], "", [SDNPMemOperand]>, + [data_ty, // vdata(VGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) + [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, + ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore; class AMDGPUStructBufferStore : DefaultAttrsIntrinsic < [], - [data_ty, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) - llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, - // bit 1 = slc, - // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) - [IntrWriteMem, ImmArg>], "", [SDNPMemOperand]>, + [data_ty, // vdata(VGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) + [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, + ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore; @@ -1097,11 +1109,12 @@ class AMDGPURawBufferAtomic : Intrinsic < !if(NoRtn, [], [data_ty]), [!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) - llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + [IntrArgMemOnly, NoCapture>, + ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic; @@ -1121,11 +1134,12 @@ [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) LLVMMatchType<0>, // cmp(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + [IntrArgMemOnly, NoCapture>, + ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; // gfx908 intrinsic @@ -1134,12 +1148,13 @@ class AMDGPUStructBufferAtomic : Intrinsic < !if(NoRtn, [], [data_ty]), [!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) - llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + [IntrArgMemOnly, NoCapture>, + ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic; @@ -1157,12 +1172,13 @@ [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) LLVMMatchType<0>, // cmp(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + [IntrArgMemOnly, NoCapture>, + ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; // gfx908 intrinsic @@ -1175,34 +1191,35 @@ // Obsolescent tbuffer intrinsics. def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic < - [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // voffset(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrReadMem, + [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + [AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic < [], - [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // voffset(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrWriteMem, ImmArg>, + [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, + ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1212,8 +1229,8 @@ // - joint format field // - joint cachepolicy field def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < - [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - [llvm_v4i32_ty, // rsrc(SGPR) + [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + [AMDGPUBufferRsrcTy>, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) @@ -1221,14 +1238,14 @@ // bit 1 = slc, // bit 2 = dlc on gfx10+), // swizzled buffer (bit 3 = swz)) - [IntrReadMem, + [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < [], [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - llvm_v4i32_ty, // rsrc(SGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) @@ -1236,13 +1253,13 @@ // bit 1 = slc, // bit 2 = dlc on gfx10+), // swizzled buffer (bit 3 = swz)) - [IntrWriteMem, + [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - [llvm_v4i32_ty, // rsrc(SGPR) + [AMDGPUBufferRsrcTy>, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) @@ -1251,14 +1268,14 @@ // bit 1 = slc, // bit 2 = dlc on gfx10+), // swizzled buffer (bit 3 = swz)) - [IntrReadMem, + [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < [], [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - llvm_v4i32_ty, // rsrc(SGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) @@ -1267,18 +1284,19 @@ // bit 1 = slc, // bit 2 = dlc on gfx10+), // swizzled buffer (bit 3 = swz)) - [IntrWriteMem, + [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; class AMDGPUBufferAtomic : Intrinsic < [llvm_anyint_ty], - [LLVMMatchType<0>, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty], // slc(imm) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + [LLVMMatchType<0>, // vdata(VGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty], // slc(imm) + [IntrArgMemOnly, NoCapture>, + ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; @@ -1294,11 +1312,12 @@ [llvm_i32_ty], [llvm_i32_ty, // src(VGPR) llvm_i32_ty, // cmp(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) + AMDGPUBufferRsrcTy, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty], // slc(imm) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + [IntrArgMemOnly, NoCapture>, + ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; @@ -1306,11 +1325,12 @@ class AMDGPUBufferAtomicFP : Intrinsic < [llvm_anyfloat_ty], [LLVMMatchType<0>, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) + AMDGPUBufferRsrcTy>, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty], // slc(imm) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + [IntrArgMemOnly, NoCapture>, + ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; // Legacy form of the intrinsic. raw and struct forms should be preferred. @@ -1318,7 +1338,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic < [], - [llvm_v4i32_ty, // rsrc(SGPR) + [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType, // LDS base offset llvm_i32_ty, // Data byte size: 1/2/4 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) @@ -1328,13 +1348,16 @@ // bit 1 = slc, // bit 2 = dlc on gfx10+)) // swizzled buffer (bit 3 = swz)) - [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, + [IntrWillReturn, IntrArgMemOnly, + ReadOnly>, NoCapture>, + WriteOnly>, NoCapture>, + ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; class AMDGPUStructBufferLoadLDS : Intrinsic < [], - [llvm_v4i32_ty, // rsrc(SGPR) + [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType, // LDS base offset llvm_i32_ty, // Data byte size: 1/2/4 llvm_i32_ty, // vindex(VGPR) @@ -1345,7 +1368,10 @@ // bit 1 = slc, // bit 2 = dlc on gfx10+)) // swizzled buffer (bit 3 = swz)) - [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, + [IntrWillReturn, IntrArgMemOnly, + ReadOnly>, NoCapture>, + WriteOnly>, NoCapture>, + ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -27,6 +28,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/IntrinsicsX86.h" @@ -39,6 +41,7 @@ #include "llvm/Support/Regex.h" #include "llvm/TargetParser/Triple.h" #include +#include using namespace llvm; @@ -589,6 +592,176 @@ return false; } +/// If the declared function is a buffer intrinsic, return its ID and the +/// position of the resource argument, but only if the resource argument is +/// represented by a <4 x i32> instead of a ptr addrspace(8). +static std::optional> +AMDGCNUpgradableBufferRsrcFunctionInfo(Function *F, StringRef Name) { + if (!Name.starts_with("amdgcn.")) + return std::nullopt; + + // Drop amdgcn. prefix + Name = Name.substr(7); + // Note that, in this switch, the ".format" and ".lds" variants come before + // that their unformatted counterparts. The second value is the position of + // the resource argument. + auto TableValue = + StringSwitch>(Name) + .StartsWith("buffer.atomic.add", + {Intrinsic::amdgcn_buffer_atomic_add, 1}) + .StartsWith("buffer.atomic.and", + {Intrinsic::amdgcn_buffer_atomic_and, 1}) + .StartsWith("buffer.atomic.cmpswap", + {Intrinsic::amdgcn_buffer_atomic_cmpswap, 2}) + .StartsWith("buffer.atomic.csub", + {Intrinsic::amdgcn_buffer_atomic_csub, 1}) + .StartsWith("buffer.atomic.fadd", + {Intrinsic::amdgcn_buffer_atomic_fadd, 1}) + .StartsWith("buffer.atomic.or", + {Intrinsic::amdgcn_buffer_atomic_or, 1}) + .StartsWith("buffer.atomic.smax", + {Intrinsic::amdgcn_buffer_atomic_smax, 1}) + .StartsWith("buffer.atomic.smin", + {Intrinsic::amdgcn_buffer_atomic_smin, 1}) + .StartsWith("buffer.atomic.sub", + {Intrinsic::amdgcn_buffer_atomic_sub, 1}) + .StartsWith("buffer.atomic.swap", + {Intrinsic::amdgcn_buffer_atomic_swap, 1}) + .StartsWith("buffer.atomic.umax", + {Intrinsic::amdgcn_buffer_atomic_umax, 1}) + .StartsWith("buffer.atomic.umin", + {Intrinsic::amdgcn_buffer_atomic_umin, 1}) + .StartsWith("buffer.atomic.xor", + {Intrinsic::amdgcn_buffer_atomic_xor, 1}) + .StartsWith("buffer.load.format", + {Intrinsic::amdgcn_buffer_load_format, 0}) + .StartsWith("buffer.load", {Intrinsic::amdgcn_buffer_load, 0}) + .StartsWith("buffer.store.format", + {Intrinsic::amdgcn_buffer_store_format, 1}) + .StartsWith("buffer.store", {Intrinsic::amdgcn_buffer_store, 1}) + .StartsWith("raw.buffer.atomic.add", + {Intrinsic::amdgcn_raw_buffer_atomic_add, 1}) + .StartsWith("raw.buffer.atomic.and", + {Intrinsic::amdgcn_raw_buffer_atomic_and, 1}) + .StartsWith("raw.buffer.atomic.cmpswap", + {Intrinsic::amdgcn_raw_buffer_atomic_cmpswap, 2}) + .StartsWith("raw.buffer.atomic.dec", + {Intrinsic::amdgcn_raw_buffer_atomic_dec, 1}) + .StartsWith("raw.buffer.atomic.fadd", + {Intrinsic::amdgcn_raw_buffer_atomic_fadd, 1}) + .StartsWith("raw.buffer.atomic.fmax", + {Intrinsic::amdgcn_raw_buffer_atomic_fmax, 1}) + .StartsWith("raw.buffer.atomic.fmin", + {Intrinsic::amdgcn_raw_buffer_atomic_fmin, 1}) + .StartsWith("raw.buffer.atomic.inc", + {Intrinsic::amdgcn_raw_buffer_atomic_inc, 1}) + .StartsWith("raw.buffer.atomic.or", + {Intrinsic::amdgcn_raw_buffer_atomic_or, 1}) + .StartsWith("raw.buffer.atomic.smax", + {Intrinsic::amdgcn_raw_buffer_atomic_smax, 1}) + .StartsWith("raw.buffer.atomic.smin", + {Intrinsic::amdgcn_raw_buffer_atomic_smin, 1}) + .StartsWith("raw.buffer.atomic.sub", + {Intrinsic::amdgcn_raw_buffer_atomic_sub, 1}) + .StartsWith("raw.buffer.atomic.swap", + {Intrinsic::amdgcn_raw_buffer_atomic_swap, 1}) + .StartsWith("raw.buffer.atomic.umax", + {Intrinsic::amdgcn_raw_buffer_atomic_umax, 1}) + .StartsWith("raw.buffer.atomic.umin", + {Intrinsic::amdgcn_raw_buffer_atomic_umin, 1}) + .StartsWith("raw.buffer.atomic.xor", + {Intrinsic::amdgcn_raw_buffer_atomic_xor, 1}) + .StartsWith("raw.buffer.load.format", + {Intrinsic::amdgcn_raw_buffer_load_format, 0}) + .StartsWith("raw.buffer.load.lds", + {Intrinsic::amdgcn_raw_buffer_load_lds, 0}) + .StartsWith("raw.buffer.load", {Intrinsic::amdgcn_raw_buffer_load, 0}) + .StartsWith("raw.buffer.store.format", + {Intrinsic::amdgcn_raw_buffer_store_format, 1}) + .StartsWith("raw.buffer.store", + {Intrinsic::amdgcn_raw_buffer_store, 1}) + .StartsWith("raw.tbuffer.load", + {Intrinsic::amdgcn_raw_tbuffer_load, 0}) + .StartsWith("raw.tbuffer.store", + {Intrinsic::amdgcn_raw_tbuffer_store, 1}) + .StartsWith("s.buffer.load", {Intrinsic::amdgcn_s_buffer_load, 0}) + .StartsWith("struct.buffer.atomic.add", + {Intrinsic::amdgcn_struct_buffer_atomic_add, 1}) + .StartsWith("struct.buffer.atomic.and", + {Intrinsic::amdgcn_struct_buffer_atomic_and, 1}) + .StartsWith("struct.buffer.atomic.cmpswap", + {Intrinsic::amdgcn_struct_buffer_atomic_cmpswap, 2}) + .StartsWith("struct.buffer.atomic.dec", + {Intrinsic::amdgcn_struct_buffer_atomic_dec, 1}) + .StartsWith("struct.buffer.atomic.fadd", + {Intrinsic::amdgcn_struct_buffer_atomic_fadd, 1}) + .StartsWith("struct.buffer.atomic.fmax", + {Intrinsic::amdgcn_struct_buffer_atomic_fmax, 1}) + .StartsWith("struct.buffer.atomic.fmin", + {Intrinsic::amdgcn_struct_buffer_atomic_fmin, 1}) + .StartsWith("struct.buffer.atomic.inc", + {Intrinsic::amdgcn_struct_buffer_atomic_inc, 1}) + .StartsWith("struct.buffer.atomic.or", + {Intrinsic::amdgcn_struct_buffer_atomic_or, 1}) + .StartsWith("struct.buffer.atomic.smax", + {Intrinsic::amdgcn_struct_buffer_atomic_smax, 1}) + .StartsWith("struct.buffer.atomic.smin", + {Intrinsic::amdgcn_struct_buffer_atomic_smin, 1}) + .StartsWith("struct.buffer.atomic.sub", + {Intrinsic::amdgcn_struct_buffer_atomic_sub, 1}) + .StartsWith("struct.buffer.atomic.swap", + {Intrinsic::amdgcn_struct_buffer_atomic_swap, 1}) + .StartsWith("struct.buffer.atomic.umax", + {Intrinsic::amdgcn_struct_buffer_atomic_umax, 1}) + .StartsWith("struct.buffer.atomic.umin", + {Intrinsic::amdgcn_struct_buffer_atomic_umin, 1}) + .StartsWith("struct.buffer.atomic.xor", + {Intrinsic::amdgcn_struct_buffer_atomic_xor, 1}) + .StartsWith("struct.buffer.load.format", + {Intrinsic::amdgcn_struct_buffer_load_format, 0}) + .StartsWith("struct.buffer.load.lds", + {Intrinsic::amdgcn_struct_buffer_load_lds, 0}) + .StartsWith("struct.buffer.load", + {Intrinsic::amdgcn_struct_buffer_load, 0}) + .StartsWith("struct.buffer.store.format", + {Intrinsic::amdgcn_struct_buffer_store_format, 1}) + .StartsWith("struct.buffer.store", + {Intrinsic::amdgcn_struct_buffer_store, 1}) + .StartsWith("struct.tbuffer.load", + {Intrinsic::amdgcn_struct_tbuffer_load, 0}) + .StartsWith("struct.tbuffer.store", + {Intrinsic::amdgcn_struct_tbuffer_store, 1}) + .StartsWith("tbuffer.load", {Intrinsic::amdgcn_tbuffer_load, 0}) + .StartsWith("tbuffer.store", {Intrinsic::amdgcn_tbuffer_store, 1}) + .Default({Intrinsic::not_intrinsic, -1}); + if (TableValue.first == Intrinsic::not_intrinsic) + return std::nullopt; + unsigned RsrcArgPosition = TableValue.second; + auto *RsrcType = + dyn_cast(F->getArg(RsrcArgPosition)->getType()); + if (!RsrcType) + return std::nullopt; + if (RsrcType->getNumElements() != 4 || + !RsrcType->getElementType()->isIntegerTy(32)) + return std::nullopt; + return TableValue; +} + +static bool UpgradeAMDGCNBufferRsrcFunction(Function *F, Intrinsic::ID IID, + unsigned RsrcArgPos, + Function *&NewFn) { + // Loads are variadic in their return type, all other functions are variadic + // in their return argument. + Type *VarType = + (RsrcArgPos == 0) ? F->getReturnType() : F->arg_begin()->getType(); + rename(F); + if (Intrinsic::isOverloaded(IID)) + NewFn = Intrinsic::getDeclaration(F->getParent(), IID, VarType); + else + NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + return true; +} + static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { assert(F && "Illegal to upgrade a non-existent Function."); @@ -836,6 +1009,16 @@ return true; } + // Buffer intrinsics have been moved from <4 x i32> %rsrc to ptr + // addrspace(8) %rsrc + std::optional> + AMDGCNBufferRsrcUpgradeInfo = + AMDGCNUpgradableBufferRsrcFunctionInfo(F, Name); + if (AMDGCNBufferRsrcUpgradeInfo.has_value()) + return UpgradeAMDGCNBufferRsrcFunction( + F, AMDGCNBufferRsrcUpgradeInfo->first, + AMDGCNBufferRsrcUpgradeInfo->second, NewFn); + break; } @@ -4019,6 +4202,22 @@ return; }; CallInst *NewCall = nullptr; + const auto &AMDGCNBufferUpgrade = [&](unsigned RsrcPos) -> CallInst * { + SmallVector Args(CI->args()); + LLVMContext &Context = F->getParent()->getContext(); + Value *OrigRsrc = Args[RsrcPos]; + // We're here for metadata changes and not an actual upgrade + if (OrigRsrc->getType()->isPointerTy()) + return nullptr; + StringRef OrigName = OrigRsrc->getName(); + Args[RsrcPos] = Builder.CreateBitCast( + Args[RsrcPos], Type::getInt128Ty(Context), "." + OrigName + ".as.int"); + Args[RsrcPos] = Builder.CreateIntToPtr( + Args[RsrcPos], PointerType::get(Context, /*AddressSpace=*/8), + "." + OrigName + ".as.ptr"); + return Builder.CreateCall(NewFn, Args); + }; + switch (NewFn->getIntrinsicID()) { default: { DefaultCase(); @@ -4148,6 +4347,91 @@ break; } + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_tbuffer_load: + NewCall = AMDGCNBufferUpgrade(/*RsrcPos=*/0); + if (!NewCall) { + DefaultCase(); + return; + } + break; + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_csub: + case Intrinsic::amdgcn_buffer_atomic_fadd: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_store: + case Intrinsic::amdgcn_buffer_store_format: + case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: + case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_buffer_store_format: + case Intrinsic::amdgcn_struct_tbuffer_store: + case Intrinsic::amdgcn_tbuffer_store: + NewCall = AMDGCNBufferUpgrade(/*RsrcPos=*/1); + if (!NewCall) { + DefaultCase(); + return; + } + break; + case Intrinsic::amdgcn_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + NewCall = AMDGCNBufferUpgrade(/*RsrcPos=*/2); + if (!NewCall) { + DefaultCase(); + return; + } + break; + case Intrinsic::bitreverse: NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)}); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -138,6 +138,9 @@ setOperationAction(ISD::LOAD, MVT::v16f64, Promote); AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32); + setOperationAction(ISD::LOAD, MVT::i128, Promote); + AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32); + // There are no 64-bit extloads. These should be done as a 32-bit extload and // an extension to 64-bit. for (MVT VT : MVT::integer_valuetypes()) @@ -264,6 +267,9 @@ setOperationAction(ISD::STORE, MVT::v16f64, Promote); AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32); + setOperationAction(ISD::STORE, MVT::i128, Promote); + AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -14,9 +14,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H -#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "AMDGPUArgumentUsageInfo.h" #include "SIInstrInfo.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/MachineInstr.h" namespace llvm { @@ -71,6 +73,7 @@ bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const; + bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -171,10 +174,6 @@ Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore = false) const; - bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, bool IsFormat) const; - bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, bool IsFormat) const; Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -20,10 +20,14 @@ #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -131,6 +135,20 @@ }; } +static LLT getBufferRsrcScalarType(const LLT Ty) { + if (!Ty.isVector()) + return LLT::scalar(128); + const unsigned NumElems = Ty.getElementCount().getFixedValue(); + return LLT::fixed_vector(NumElems, LLT::scalar(128)); +} + +static LLT getBufferRsrcRegisterType(const LLT Ty) { + if (!Ty.isVector()) + return LLT::fixed_vector(4, LLT::scalar(32)); + const unsigned NumElems = Ty.getElementCount().getFixedValue(); + return LLT::fixed_vector(NumElems * 4, LLT::scalar(32)); +} + static LLT getBitcastRegisterType(const LLT Ty) { const unsigned Size = Ty.getSizeInBits(); @@ -249,6 +267,7 @@ case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + case AMDGPUAS::BUFFER_RESOURCE: // Treat constant and global as identical. SMRD loads are sometimes usable for // global loads (ideally constant address space should be eliminated) // depending on the context. Legality cannot be context dependent, but @@ -332,6 +351,16 @@ return true; } +static bool hasBufferRsrcWorkaround(const LLT Ty) { + if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) + return true; + if (Ty.isVector()) { + const LLT ElemTy = Ty.getElementType(); + return hasBufferRsrcWorkaround(ElemTy); + } + return false; +} + // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so // workaround this. Eventually it should ignore the type for loads and only care // about the size. Return true in cases where we will workaround this for now by @@ -343,6 +372,9 @@ const unsigned Size = Ty.getSizeInBits(); if (Size <= 64) return false; + // Address space 8 pointers get their own workaround. + if (hasBufferRsrcWorkaround(Ty)) + return false; if (!Ty.isVector()) return true; @@ -357,7 +389,7 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { const LLT Ty = Query.Types[0]; return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && - !loadStoreBitcastWorkaround(Ty); + !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty); } /// Return true if a load or store of the type should be lowered with a bitcast @@ -425,6 +457,83 @@ Query.Types[1].getAddressSpace(), Opcode); } +/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial +/// type of the operand `idx` and then to trunsform it to a `p8` via bitcasts +/// and inttoptr. In addition, handle vectors of p8. Returns the new type. +static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, + MachineRegisterInfo &MRI, unsigned Idx) { + MachineOperand &MO = MI.getOperand(Idx); + + const LLT PointerTy = MRI.getType(MO.getReg()); + + // Paranoidly prevent us from doing this multiple times. + if (!hasBufferRsrcWorkaround(PointerTy)) + return PointerTy; + + const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); + const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); + if (!PointerTy.isVector()) { + // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8) + const unsigned NumParts = PointerTy.getSizeInBits() / 32; + const LLT S32 = LLT::scalar(32); + + Register VectorReg = MRI.createGenericVirtualRegister(VectorTy); + SmallVector VectorElems; + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + for (unsigned I = 0; I < NumParts; ++I) { + VectorElems.push_back(MRI.createGenericVirtualRegister(S32)); + B.buildExtractVectorElementConstant(VectorElems.back(), VectorReg, I); + } + B.buildMergeValues(MO, VectorElems); + MO.setReg(VectorReg); + return VectorTy; + } + Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy); + Register ScalarReg = MRI.createGenericVirtualRegister(ScalarTy); + + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + B.buildBitcast(ScalarReg, BitcastReg); + B.buildIntToPtr(MO, ScalarReg); + MO.setReg(BitcastReg); + + return VectorTy; +} + +/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is +/// the form in which the value must be in order to be passed to the low-level +/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is +/// needed in order to account for the fact that we can't define a register +/// class for s128 without breaking SelectionDAG. +static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { + MachineRegisterInfo &MRI = *B.getMRI(); + const LLT PointerTy = MRI.getType(Pointer); + const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); + const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); + + if (!PointerTy.isVector()) { + // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) + SmallVector PointerParts; + const unsigned NumParts = PointerTy.getSizeInBits() / 32; + auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); + for (unsigned I = 0; I < NumParts; ++I) + PointerParts.push_back(Unmerged.getReg(I)); + return B.buildBuildVector(VectorTy, PointerParts).getReg(0); + } + Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); + return B.buildBitcast(VectorTy, Scalar).getReg(0); +} + +static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, + unsigned Idx) { + MachineOperand &MO = MI.getOperand(Idx); + + const LLT PointerTy = B.getMRI()->getType(MO.getReg()); + // Paranoidly prevent us from doing this multiple times. + if (!hasBufferRsrcWorkaround(PointerTy)) + return; + MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); +} + AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const GCNTargetMachine &TM) : ST(ST_) { @@ -487,6 +596,8 @@ const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); + const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); + const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); const LLT CodePtr = FlatPtr; @@ -498,6 +609,8 @@ LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr }; + const std::initializer_list AddrSpaces128 = {RsrcPtr}; + const std::initializer_list FPTypesBase = { S32, S64 }; @@ -518,17 +631,18 @@ // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more // elements for v3s16 getActionDefinitionsBuilder(G_PHI) - .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) - .legalFor(AllS32Vectors) - .legalFor(AllS64Vectors) - .legalFor(AddrSpaces64) - .legalFor(AddrSpaces32) - .legalIf(isPointer(0)) - .clampScalar(0, S16, S256) - .widenScalarToNextPow2(0, 32) - .clampMaxNumElements(0, S32, 16) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .scalarize(0); + .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) + .legalFor(AllS32Vectors) + .legalFor(AllS64Vectors) + .legalFor(AddrSpaces64) + .legalFor(AddrSpaces32) + .legalFor(AddrSpaces128) + .legalIf(isPointer(0)) + .clampScalar(0, S16, S256) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .scalarize(0); if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { // Full set of gfx9 features. @@ -909,9 +1023,10 @@ } getActionDefinitionsBuilder(G_PTR_ADD) - .legalIf(all(isPointer(0), sameSize(0, 1))) - .scalarize(0) - .scalarSameSizeAs(1, 0); + .unsupportedFor({BufferFatPtr, RsrcPtr}) + .legalIf(all(isPointer(0), sameSize(0, 1))) + .scalarize(0) + .scalarSameSizeAs(1, 0); getActionDefinitionsBuilder(G_PTRMASK) .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) @@ -1183,6 +1298,18 @@ return isLoadStoreLegal(ST, Query); }); + // The custom pointers (fat pointers, buffer resources) don't work with load + // and store at this level. Fat pointers should haev been lowered to + // intrinsics before the translation to MIR. + Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr})); + + // Address space 8 pointers are handled by a 4xs32 load, bitcast, and + // ptrtoint. This is needed to account for the fact that we can't have i128 + // as a register class for SelectionDAG reasons. + Actions.customIf([=](const LegalityQuery &Query) -> bool { + return hasBufferRsrcWorkaround(Query.Types[0]); + }); + // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to // 64-bits. // @@ -1455,6 +1582,8 @@ const LLT VecTy = Query.Types[VecTypeIdx]; const LLT IdxTy = Query.Types[IdxTypeIdx]; const unsigned EltSize = EltTy.getSizeInBits(); + if (EltTy.isPointer() && EltSize > 64) + return true; return (EltSize == 32 || EltSize == 64) && VecTy.getSizeInBits() % 32 == 0 && VecTy.getSizeInBits() <= MaxRegisterSize && @@ -1782,6 +1911,8 @@ case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: return legalizeLoad(Helper, MI); + case TargetOpcode::G_STORE: + return legalizeStore(Helper, MI); case TargetOpcode::G_FMAD: return legalizeFMad(MI, MRI, B); case TargetOpcode::G_FDIV: @@ -2336,6 +2467,32 @@ // TODO: Promote dynamic indexing of s16 to s32 + Register Dst = MI.getOperand(0).getReg(); + Register Vec = MI.getOperand(1).getReg(); + + LLT VecTy = MRI.getType(Vec); + LLT EltTy = VecTy.getElementType(); + assert(EltTy == MRI.getType(Dst)); + + // Legalize vectors of large pointers to vectors of large integers so + // we don't try to do pointer <-> integer bitcasts later in legalization. + if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { + LLT IntVecTy = + LLT::scalarOrVector(VecTy.getElementCount(), EltTy.getSizeInBits()); + LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); + + Register IntVec = MRI.createGenericVirtualRegister(IntVecTy); + B.buildPtrToInt(IntVec, Vec); + + Register IntElt = MRI.createGenericVirtualRegister(IntTy); + B.buildExtractVectorElement(IntElt, IntVec, MI.getOperand(2)); + + B.buildIntToPtr(Dst, IntElt); + + MI.eraseFromParent(); + return true; + } + // FIXME: Artifact combiner probably should have replaced the truncated // constant before this, so we shouldn't need // getIConstantVRegValWithLookThrough. @@ -2344,14 +2501,6 @@ if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. return true; const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); - - Register Dst = MI.getOperand(0).getReg(); - Register Vec = MI.getOperand(1).getReg(); - - LLT VecTy = MRI.getType(Vec); - LLT EltTy = VecTy.getElementType(); - assert(EltTy == MRI.getType(Dst)); - if (IdxVal < VecTy.getNumElements()) { auto Unmerge = B.buildUnmerge(EltTy, Vec); B.buildCopy(Dst, Unmerge.getReg(IdxVal)); @@ -2370,6 +2519,36 @@ // TODO: Promote dynamic indexing of s16 to s32 + Register Dst = MI.getOperand(0).getReg(); + Register Vec = MI.getOperand(1).getReg(); + Register Ins = MI.getOperand(2).getReg(); + + LLT VecTy = MRI.getType(Vec); + LLT EltTy = VecTy.getElementType(); + assert(EltTy == MRI.getType(Ins)); + + // Legalize vectors of large pointers to vectors of large integers so + // we don't try to do pointer <-> integer bitcasts later in legalization. + if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { + LLT IntVecTy = + LLT::scalarOrVector(VecTy.getElementCount(), EltTy.getSizeInBits()); + LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); + + Register IntVecSource = MRI.createGenericVirtualRegister(IntVecTy); + B.buildPtrToInt(IntVecSource, Vec); + Register IntIns = MRI.createGenericVirtualRegister(IntTy); + B.buildPtrToInt(IntIns, Ins); + + Register IntVecDest = MRI.createGenericVirtualRegister(IntVecTy); + B.buildInsertVectorElement(IntVecDest, IntVecSource, IntIns, + MI.getOperand(3)); + + B.buildIntToPtr(Dst, IntVecDest); + + MI.eraseFromParent(); + return true; + } + // FIXME: Artifact combiner probably should have replaced the truncated // constant before this, so we shouldn't need // getIConstantVRegValWithLookThrough. @@ -2379,15 +2558,6 @@ return true; const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); - Register Dst = MI.getOperand(0).getReg(); - Register Vec = MI.getOperand(1).getReg(); - Register Ins = MI.getOperand(2).getReg(); - - LLT VecTy = MRI.getType(Vec); - LLT EltTy = VecTy.getElementType(); - assert(EltTy == MRI.getType(Ins)); - (void)Ins; - unsigned NumElts = VecTy.getNumElements(); if (IdxVal < NumElts) { SmallVector SrcRegs; @@ -2627,6 +2797,13 @@ Register ValReg = MI.getOperand(0).getReg(); LLT ValTy = MRI.getType(ValReg); + if (hasBufferRsrcWorkaround(ValTy)) { + Observer.changingInstr(MI); + castBufferRsrcFromV4I32(MI, B, MRI, 0); + Observer.changedInstr(MI); + return true; + } + MachineMemOperand *MMO = *MI.memoperands_begin(); const unsigned ValSize = ValTy.getSizeInBits(); const LLT MemTy = MMO->getMemoryType(); @@ -2684,6 +2861,24 @@ return false; } +bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; + + Register DataReg = MI.getOperand(0).getReg(); + LLT DataTy = MRI.getType(DataReg); + + if (hasBufferRsrcWorkaround(DataTy)) { + Observer.changingInstr(MI); + castBufferRsrcArgToV4I32(MI, B, 0); + Observer.changedInstr(MI); + return true; + } + return false; +} + bool AMDGPULegalizerInfo::legalizeFMad( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -4316,9 +4511,6 @@ uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; MMO->setOffset(TotalOffset); - } else { - // We don't have a constant combined offset to use in the MMO. Give up. - MMO->setValue((Value *)nullptr); } } @@ -4393,6 +4585,10 @@ const LLT S16 = LLT::scalar(16); + // Fixup buffer resources themselves needing to be v4i128. + if (hasBufferRsrcWorkaround(Ty)) + return castBufferRsrcToV4I32(VData, B); + // Fixup illegal register types for i8 stores. if (Ty == LLT::scalar(8) || Ty == S16) { Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); @@ -4421,6 +4617,7 @@ const LLT S32 = LLT::scalar(32); VData = fixStoreSourceType(B, VData, IsFormat); + castBufferRsrcArgToV4I32(MI, B, 2); Register RSrc = MI.getOperand(2).getReg(); MachineMemOperand *MMO = *MI.memoperands_begin(); @@ -4538,6 +4735,7 @@ ++OpOffset; } + castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); Register RSrc = MI.getOperand(2 + OpOffset).getReg(); // The typed intrinsics add an immediate after the registers. @@ -4566,6 +4764,12 @@ unsigned ImmOffset; LLT Ty = MRI.getType(Dst); + // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the + // logic doesn't have to handle that case. + if (hasBufferRsrcWorkaround(Ty)) { + Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); + Dst = MI.getOperand(0).getReg(); + } LLT EltTy = Ty.getScalarType(); const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); const bool Unpacked = ST.hasUnpackedD16VMem(); @@ -4738,6 +4942,8 @@ OpOffset = -1; } + // Since we don't have 128-bit atomics, we don't need to handle the case of + // p8 argmunents to the atomic itself. Register VData = MI.getOperand(2 + OpOffset).getReg(); Register CmpVal; @@ -4746,6 +4952,7 @@ ++OpOffset; } + castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset); Register RSrc = MI.getOperand(3 + OpOffset).getReg(); const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; @@ -5281,10 +5488,14 @@ Register Dst = MI.getOperand(0).getReg(); LLT Ty = B.getMRI()->getType(Dst); unsigned Size = Ty.getSizeInBits(); - MachineFunction &MF = B.getMF(); Observer.changingInstr(MI); + if (hasBufferRsrcWorkaround(Ty)) { + Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); + Dst = MI.getOperand(0).getReg(); + B.setInsertPt(B.getMBB(), MI); + } if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { Ty = getBitcastRegisterType(Ty); Helper.bitcastDst(MI, Ty, 0); @@ -5298,16 +5509,12 @@ MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); MI.removeOperand(1); // Remove intrinsic ID - // FIXME: When intrinsic definition is fixed, this should have an MMO already. - // TODO: Should this use datalayout alignment? - const unsigned MemSize = (Size + 7) / 8; - const Align MemAlign(4); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - MemSize, MemAlign); - MI.addMemOperand(MF, MMO); + MachineMemOperand *MMO = *MI.memoperands_begin(); + MMO->setFlags(MMO->getFlags() | MachineMemOperand::MOInvariant); + + // Hack around the need for v4i32 values in the underlying descriptors because + // of SelectionDAG. + castBufferRsrcArgToV4I32(MI, B, 1); // There are no 96-bit result scalar loads, but widening to 128-bit should // always be legal. We may need to restore this to a 96-bit result if it turns diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1363,11 +1363,12 @@ // can, but we need to track an MMO for that. const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; const Align MemAlign(4); // FIXME: ABI type alignment? + MachineMemOperand *OrigMMO = *MI.memoperands_begin(); MachineMemOperand *BaseMMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - MemSize, MemAlign); + OrigMMO->getPointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + MemSize, MemAlign); if (MMOOffset != 0) BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -72,8 +72,9 @@ MVT VT, unsigned Offset) const; SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG, bool WithChain) const; - SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset, - SDValue CachePolicy, SelectionDAG &DAG) const; + SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Chain, SDValue Rsrc, + SDValue Offset, SDValue CachePolicy, + SelectionDAG &DAG) const; SDValue lowerRawBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; @@ -253,6 +254,12 @@ void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, Align Alignment = Align(4)) const; + // Convert the i128 that a addrspace(8) pointer is natively represented as + // into the v4i32 that all the buffer intrinsics expent to receive. We can't + // add register classes for i128 on pain of the promotion logic going haywire, + // so this slightly ugly hack is what we've got. + SDValue bufferRsrcPtrToVector(SDValue Pointer, SelectionDAG &DAG) const; + // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, ArrayRef Ops, MemSDNode *M) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -29,6 +29,9 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -728,19 +731,19 @@ setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, - MVT::v2i16, MVT::v2f16}, + MVT::v2i16, MVT::v2f16, MVT::i128}, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, - MVT::i16, MVT::i8}, + MVT::i16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::INTRINSIC_VOID, {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, - MVT::i8}, + MVT::i8, MVT::i128}, Custom); setTargetDAGCombine({ISD::ADD, @@ -954,35 +957,44 @@ Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } -static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) { +static EVT memVTFromLoadIntrData(Type *Ty, const DataLayout &DL, + unsigned MaxNumLanes) { assert(MaxNumLanes != 0); + if (auto *PT = dyn_cast(Ty)) + return EVT::getIntegerVT(Ty->getContext(), + DL.getPointerSizeInBits(PT->getAddressSpace())); if (auto *VT = dyn_cast(Ty)) { unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements()); - return EVT::getVectorVT(Ty->getContext(), - EVT::getEVT(VT->getElementType()), - NumElts); + Type *ElemTy = VT->getElementType(); + EVT ElemTyVT = EVT::getEVT(ElemTy); + if (auto *PT = dyn_cast(ElemTy)) + ElemTyVT = EVT::getIntegerVT( + Ty->getContext(), DL.getPointerSizeInBits(PT->getAddressSpace())); + return EVT::getVectorVT(Ty->getContext(), ElemTyVT, NumElts); } return EVT::getEVT(Ty); } // Peek through TFE struct returns to only use the data size. -static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) { +static EVT memVTFromLoadIntrReturn(Type *Ty, const DataLayout &DL, + unsigned MaxNumLanes) { auto *ST = dyn_cast(Ty); if (!ST) - return memVTFromLoadIntrData(Ty, MaxNumLanes); + return memVTFromLoadIntrData(Ty, DL, MaxNumLanes); // TFE intrinsics return an aggregate type. assert(ST->getNumContainedTypes() == 2 && ST->getContainedType(1)->isIntegerTy(32)); - return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes); + return memVTFromLoadIntrData(ST->getContainedType(0), DL, MaxNumLanes); } bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, unsigned IntrID) const { + const DataLayout &DL = MF.getDataLayout(); Info.flags = MachineMemOperand::MONone; if (CI.hasMetadata(LLVMContext::MD_invariant_load)) Info.flags |= MachineMemOperand::MOInvariant; @@ -1000,6 +1012,8 @@ if (RsrcIntr->IsImage) Info.align.reset(); + if (!RsrcIntr->IsImage) // Non-image intrinsics take pointer arguments + Info.ptrVal = CI.getArgOperand(RsrcIntr->RsrcArg); Info.flags |= MachineMemOperand::MODereferenceable; if (ME.onlyReadsMemory()) { @@ -1020,7 +1034,7 @@ } } - Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes); + Info.memVT = memVTFromLoadIntrReturn(CI.getType(), DL, MaxNumLanes); // FIXME: What does alignment mean for an image? Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1032,7 +1046,7 @@ if (RsrcIntr->IsImage) { unsigned DMask = cast(CI.getArgOperand(1))->getZExtValue(); unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask); - Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes); + Info.memVT = memVTFromLoadIntrData(DataTy, DL, DMaskLanes); } else Info.memVT = EVT::getEVT(DataTy); @@ -6748,8 +6762,9 @@ DMaskLanes, NumVDataDwords, DL); } -SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, - SDValue Offset, SDValue CachePolicy, +SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Chain, + SDValue Rsrc, SDValue Offset, + SDValue CachePolicy, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -6764,26 +6779,27 @@ VT.getStoreSize(), Alignment); if (!Offset->isDivergent()) { - SDValue Ops[] = { - Rsrc, - Offset, // Offset - CachePolicy - }; + SDValue Ops[] = {Chain, Rsrc, + Offset, // Offset + CachePolicy}; // Widen vec3 load to vec4. if (VT.isVector() && VT.getVectorNumElements() == 3) { EVT WidenedVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); auto WidenedOp = DAG.getMemIntrinsicNode( - AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT, + AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT, MVT::Other), + Ops, WidenedVT, MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize())); auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, DAG.getVectorIdxConstant(0, DL)); - return Subvector; + auto Merged = + DAG.getMergeValues({Subvector, SDValue(WidenedOp.getNode(), 1)}, DL); + return Merged; } return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, - DAG.getVTList(VT), Ops, VT, MMO); + DAG.getVTList(VT, MVT::Other), Ops, VT, MMO); } // We have a divergent offset. Emit a MUBUF buffer load instead. We can @@ -6800,16 +6816,16 @@ LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4); } - SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); + SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other}); SDValue Ops[] = { - DAG.getEntryNode(), // Chain - Rsrc, // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - {}, // voffset - {}, // soffset - {}, // offset - CachePolicy, // cachepolicy - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Chain, // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + CachePolicy, // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; // Use the alignment to ensure that the required offsets will fit into the @@ -6818,14 +6834,21 @@ NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); uint64_t InstOffset = cast(Ops[5])->getZExtValue(); + for (unsigned i = 0; i < NumLoads; ++i) { Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, LoadVT, MMO, DAG)); } - if (NumElts == 8 || NumElts == 16) - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); + if (NumElts == 8 || NumElts == 16) { + SDValue Result = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); + SmallVector Chains; + for (SDValue Load : Loads) + Chains.emplace_back(Load.getNode(), 1); + SDValue NewChain = DAG.getTokenFactor(DL, Chains); + return DAG.getMergeValues({Result, NewChain}, DL); + } return Loads[0]; } @@ -7015,13 +7038,6 @@ case Intrinsic::amdgcn_wavefrontsize: return DAG.getConstant(MF.getSubtarget().getWavefrontSize(), SDLoc(Op), MVT::i32); - case Intrinsic::amdgcn_s_buffer_load: { - unsigned CPol = cast(Op.getOperand(3))->getZExtValue(); - if (CPol & ~AMDGPU::CPol::ALL) - return Op; - return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - DAG); - } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); case Intrinsic::amdgcn_sin: @@ -7223,16 +7239,17 @@ SDValue VData = Op.getOperand(2); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - VData, // vdata - Op.getOperand(3), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + VData, // vdata + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; auto *M = cast(Op); @@ -7257,17 +7274,18 @@ SDLoc DL(Op); SDValue VData = Op.getOperand(2); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + VData, // vdata + Rsrc, // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; auto *M = cast(Op); @@ -7388,20 +7406,31 @@ return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_s_buffer_load: { + unsigned CPol = cast(Op.getOperand(4))->getZExtValue(); + if (CPol & ~AMDGPU::CPol::ALL) + return Op; + SDValue Chain = Op->getOperand(0); + EVT VT = Op->getValueType(0); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); + return lowerSBuffer(VT, DL, Chain, Rsrc, Op.getOperand(3), Op.getOperand(4), + DAG); + } case Intrinsic::amdgcn_buffer_load: case Intrinsic::amdgcn_buffer_load_format: { + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); unsigned Glc = cast(Op.getOperand(5))->getZExtValue(); unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + Op.getOperand(3), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); @@ -7430,16 +7459,17 @@ case Intrinsic::amdgcn_raw_buffer_load_format: { const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format; + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(4), // soffset - Offsets.second, // offset - Op.getOperand(5), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(4), // soffset + Offsets.second, // offset + Op.getOperand(5), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; auto *M = cast(Op); @@ -7450,16 +7480,17 @@ case Intrinsic::amdgcn_struct_buffer_load_format: { const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format; + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + Op.getOperand(3), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; auto *M = cast(Op); @@ -7470,21 +7501,22 @@ MemSDNode *M = cast(Op); EVT LoadVT = Op.getValueType(); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); unsigned Dfmt = cast(Op.getOperand(7))->getZExtValue(); unsigned Nfmt = cast(Op.getOperand(8))->getZExtValue(); unsigned Glc = cast(Op.getOperand(9))->getZExtValue(); unsigned Slc = cast(Op.getOperand(10))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - Op.getOperand(5), // soffset - Op.getOperand(6), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -7497,18 +7529,19 @@ case Intrinsic::amdgcn_raw_tbuffer_load: { MemSDNode *M = cast(Op); EVT LoadVT = Op.getValueType(); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(4), // soffset - Offsets.second, // offset - Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(4), // soffset + Offsets.second, // offset + Op.getOperand(5), // format + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -7521,18 +7554,19 @@ case Intrinsic::amdgcn_struct_tbuffer_load: { MemSDNode *M = cast(Op); EVT LoadVT = Op.getValueType(); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + Op.getOperand(3), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -7554,18 +7588,19 @@ case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_buffer_atomic_fadd: { + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // vdata + Rsrc, // rsrc + Op.getOperand(4), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); @@ -7686,19 +7721,20 @@ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); case Intrinsic::amdgcn_buffer_atomic_cmpswap: { + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); unsigned Slc = cast(Op.getOperand(7))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(5)); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - Op.getOperand(5), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Rsrc, // rsrc + Op.getOperand(5), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); @@ -7710,18 +7746,19 @@ Op->getVTList(), Ops, VT, M->getMemOperand()); } case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: { + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast(Op); @@ -7731,18 +7768,19 @@ Op->getVTList(), Ops, VT, M->getMemOperand()); } case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: { + SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - Op.getOperand(5), // vindex - Offsets.first, // voffset - Op.getOperand(7), // soffset - Offsets.second, // offset - Op.getOperand(8), // cachepolicy - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Rsrc, // rsrc + Op.getOperand(5), // vindex + Offsets.first, // voffset + Op.getOperand(7), // soffset + Offsets.second, // offset + Op.getOperand(8), // cachepolicy + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast(Op); @@ -8105,22 +8143,23 @@ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); unsigned Dfmt = cast(Op.getOperand(8))->getZExtValue(); unsigned Nfmt = cast(Op.getOperand(9))->getZExtValue(); unsigned Glc = cast(Op.getOperand(10))->getZExtValue(); unsigned Slc = cast(Op.getOperand(11))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Op.getOperand(5), // voffset - Op.getOperand(6), // soffset - Op.getOperand(7), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen + Chain, + VData, // vdata + Rsrc, // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -8135,17 +8174,18 @@ if (IsD16) VData = handleD16VData(VData, DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Chain, + VData, // vdata + Rsrc, // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // format + Op.getOperand(8), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -8159,18 +8199,19 @@ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Chain, + VData, // vdata + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -8185,19 +8226,20 @@ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); unsigned Glc = cast(Op.getOperand(6))->getZExtValue(); unsigned Slc = cast(Op.getOperand(7))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen + Chain, + VData, + Rsrc, // rsrc + Op.getOperand(4), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); @@ -8236,17 +8278,18 @@ getEquivalentMemType(*DAG.getContext(), VDataVT), VData); } + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Chain, + VData, + Rsrc, + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; @@ -8283,17 +8326,18 @@ getEquivalentMemType(*DAG.getContext(), VDataVT), VData); } + auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Chain, + VData, + Rsrc, + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; @@ -8355,7 +8399,8 @@ else if (HasVOffset) Ops.push_back(VOffset); - Ops.push_back(Op.getOperand(2)); // rsrc + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); + Ops.push_back(Rsrc); Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); @@ -8588,6 +8633,17 @@ Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); } +SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue Pointer, + SelectionDAG &DAG) const { + if (!Pointer.getValueType().isScalarInteger()) + return Pointer; + + SDLoc DL(Pointer); + + SDValue Rsrc = DAG.getBitcast(MVT::v4i32, Pointer); + return Rsrc; +} + // Handle 8 bit and 16 bit buffer loads SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -42,7 +42,7 @@ def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, - [SDNPMayLoad, SDNPMemOperand] + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT", diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll --- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -2,303 +2,303 @@ ; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print' -disable-output %s 2>&1 | FileCheck %s ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32( -define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_swap(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32( -define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_add(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32( -define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_sub(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32( -define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_smin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32( -define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_umin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32( -define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_smax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32( -define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_umax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32( -define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_and(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32( -define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_or(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32( -define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_xor(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( -define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +define float @buffer_atomic_cmpswap(ptr addrspace(8) inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32( -define float @raw_buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_swap(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32( -define float @raw_buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_add(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32( -define float @raw_buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_sub(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32( -define float @raw_buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_smin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32( -define float @raw_buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_umin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32( -define float @raw_buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_smax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32( -define float @raw_buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_umax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32( -define float @raw_buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_and(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32( -define float @raw_buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_or(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32( -define float @raw_buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @raw_buffer_atomic_xor(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32( -define float @raw_buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +define float @raw_buffer_atomic_cmpswap(ptr addrspace(8) inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32( -define float @struct_buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_swap(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32( -define float @struct_buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_add(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32( -define float @struct_buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_sub(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32( -define float @struct_buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_smin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32( -define float @struct_buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_umin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32( -define float @struct_buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_smax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32( -define float @struct_buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_umax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32( -define float @struct_buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_and(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32( -define float @struct_buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_or(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32( -define float @struct_buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @struct_buffer_atomic_xor(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32( -define float @struct_buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 - -declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) #0 - -declare i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32) #0 +define float @struct_buffer_atomic_cmpswap(ptr addrspace(8) inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, ptr addrspace(8), i32, i32, i1) #0 + +declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32) #0 + +declare i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32, i32) #0 attributes #0 = { nounwind } diff --git a/llvm/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll --- a/llvm/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/llvm/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,103 +1,103 @@ ;RUN: opt -mtriple=amdgcn-mesa-mesa3d -amdgpu-use-legacy-divergence-analysis -passes='print' 2>&1 -disable-output %s | FileCheck %s ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32( -define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_swap(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32( -define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_add(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32( -define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_sub(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32( -define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_smin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32( -define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_umin(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32( -define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_smax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32( -define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_umax(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32( -define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_and(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32( -define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_or(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32( -define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +define float @buffer_atomic_xor(ptr addrspace(8) inreg %rsrc, i32 inreg %data) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( -define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +define float @buffer_atomic_cmpswap(ptr addrspace(8) inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %r = bitcast i32 %orig to float ret float %r } -declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, ptr addrspace(8), i32, i32, i1) #0 attributes #0 = { nounwind } diff --git a/llvm/test/Bitcode/upgrade-amdgpu-amdgcn-buffer-intrinsics.ll b/llvm/test/Bitcode/upgrade-amdgpu-amdgcn-buffer-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Bitcode/upgrade-amdgpu-amdgcn-buffer-intrinsics.ll @@ -0,0 +1,3194 @@ +;; Test auto-upgrade of AMDGPU buffer resource intrinsics +;; Generated from the set of intrinsics manually updated in other tests +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32) +define amdgpu_cs <16 x float> @upgrade.s.buffer.load.v16f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v16f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 0, i32 0) + ret <16 x float> %val +} + +declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32) +define amdgpu_cs <16 x i16> @upgrade.s.buffer.load.v16i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v16i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 0, i32 0) + ret <16 x i16> %val +} + +declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32) +define amdgpu_cs <16 x i32> @upgrade.s.buffer.load.v16i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v16i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 0, i32 0) + ret <16 x i32> %val +} + +declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <1 x float> @upgrade.buffer.load.format.v1f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.v1f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <1 x float> %val +} + +declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <1 x float> @upgrade.buffer.load.v1f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.v1f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <1 x float> %val +} + +declare <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <1 x float> @upgrade.raw.buffer.load.format.v1f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.v1f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <1 x float> %val +} + +declare <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <1 x float> @upgrade.raw.buffer.load.v1f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v1f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <1 x float> %val +} + +declare <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <1 x float> @upgrade.struct.buffer.load.format.v1f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v1f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <1 x float> %val +} + +declare <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <1 x float> @upgrade.struct.buffer.load.v1f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v1f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <1 x float> %val +} + +declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <1 x float> @upgrade.tbuffer.load.v1f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v1f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <1 x float> %val +} + +declare { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { <2 x float>, i32 } @upgrade.struct.buffer.load.format.sl_v2f32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_v2f32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { <2 x float>, i32 } %val +} + +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <2 x float> @upgrade.buffer.load.format.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <2 x float> @upgrade.buffer.load.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x float> @upgrade.raw.buffer.load.format.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x float> @upgrade.raw.buffer.load.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x float> @upgrade.raw.tbuffer.load.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) +define amdgpu_cs <2 x float> @upgrade.s.buffer.load.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x float> @upgrade.struct.buffer.load.format.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x float> @upgrade.struct.buffer.load.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <2 x float> @upgrade.struct.tbuffer.load.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <2 x float> %val +} + +declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <2 x float> @upgrade.tbuffer.load.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <2 x float> %val +} + +declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) +define amdgpu_cs <2 x half> @upgrade.buffer.atomic.fadd.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.fadd.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <2 x half> @upgrade.buffer.load.format.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.raw.buffer.atomic.fadd.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.fadd.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.raw.buffer.load.format.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.raw.buffer.load.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.raw.tbuffer.load.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) +define amdgpu_cs <2 x half> @upgrade.s.buffer.load.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.struct.buffer.atomic.fadd.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.fadd.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.struct.buffer.load.format.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.struct.buffer.load.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <2 x half> @upgrade.struct.tbuffer.load.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <2 x half> %val +} + +declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <2 x half> @upgrade.tbuffer.load.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <2 x half> %val +} + +declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x i16> @upgrade.raw.buffer.load.v2i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v2i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x i16> %val +} + +declare <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x i16> @upgrade.struct.buffer.load.v2i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v2i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x i16> %val +} + +declare { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { <2 x i32>, i32 } @upgrade.struct.buffer.load.format.sl_v2i32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_v2i32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { <2 x i32>, i32 } %val +} + +declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x i32> @upgrade.raw.buffer.load.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x i32> %val +} + +declare <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x i32> @upgrade.raw.tbuffer.load.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x i32> %val +} + +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) +define amdgpu_cs <2 x i32> @upgrade.s.buffer.load.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0) + ret <2 x i32> %val +} + +declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x i32> @upgrade.struct.buffer.load.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x i32> %val +} + +declare <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <2 x i32> @upgrade.struct.tbuffer.load.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <2 x i32> %val +} + +declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <2 x i32> @upgrade.tbuffer.load.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <2 x i32> %val +} + +declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32>, i32, i32, i32) +define amdgpu_cs <2 x i8> @upgrade.raw.buffer.load.v2i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v2i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x i8> %val +} + +declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32>, i32, i32) +define amdgpu_cs <2 x i8> @upgrade.s.buffer.load.v2i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v2i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 0, i32 0) + ret <2 x i8> %val +} + +declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <2 x i8> @upgrade.struct.buffer.load.v2i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v2i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x i8> %val +} + +declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32) +define amdgpu_cs <32 x i16> @upgrade.s.buffer.load.v32i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v32i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 0, i32 0) + ret <32 x i16> %val +} + +declare { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { <3 x float>, i32 } @upgrade.struct.buffer.load.format.sl_v3f32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_v3f32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { <3 x float>, i32 } %val +} + +declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <3 x float> @upgrade.buffer.load.format.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <3 x float> @upgrade.buffer.load.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <3 x float> @upgrade.raw.buffer.load.format.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <3 x float> @upgrade.raw.buffer.load.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x float> @upgrade.raw.tbuffer.load.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) +define amdgpu_cs <3 x float> @upgrade.s.buffer.load.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 0, i32 0) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x float> @upgrade.struct.buffer.load.format.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x float> @upgrade.struct.buffer.load.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <3 x float> @upgrade.struct.tbuffer.load.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <3 x float> %val +} + +declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <3 x float> @upgrade.tbuffer.load.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <3 x float> %val +} + +declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <3 x half> @upgrade.buffer.load.format.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <3 x half> @upgrade.raw.buffer.load.format.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <3 x half> @upgrade.raw.buffer.load.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x half> @upgrade.raw.tbuffer.load.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) +define amdgpu_cs <3 x half> @upgrade.s.buffer.load.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x half> @upgrade.struct.buffer.load.format.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x half> @upgrade.struct.buffer.load.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <3 x half> @upgrade.struct.tbuffer.load.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <3 x half> %val +} + +declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <3 x half> @upgrade.tbuffer.load.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <3 x half> %val +} + +declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { <3 x i32>, i32 } @upgrade.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_v3i32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { <3 x i32>, i32 } %val +} + +declare <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x i32> @upgrade.raw.tbuffer.load.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x i32> %val +} + +declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32) +define amdgpu_cs <3 x i32> @upgrade.s.buffer.load.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 0, i32 0) + ret <3 x i32> %val +} + +declare <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <3 x i32> @upgrade.struct.tbuffer.load.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <3 x i32> %val +} + +declare <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <3 x i32> @upgrade.tbuffer.load.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <3 x i32> %val +} + +declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32>, i32, i32, i32) +define amdgpu_cs <3 x i8> @upgrade.raw.buffer.load.v3i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v3i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <3 x i8> %val +} + +declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) +define amdgpu_cs <3 x i8> @upgrade.s.buffer.load.v3i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v3i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 0, i32 0) + ret <3 x i8> %val +} + +declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <3 x i8> @upgrade.struct.buffer.load.v3i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v3i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <3 x i8> %val +} + +declare { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { <4 x float>, i32 } @upgrade.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_v4f32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { <4 x float>, i32 } %val +} + +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <4 x float> @upgrade.buffer.load.format.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <4 x float> @upgrade.buffer.load.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <4 x float> @upgrade.raw.buffer.load.format.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <4 x float> @upgrade.raw.buffer.load.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x float> @upgrade.raw.tbuffer.load.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) +define amdgpu_cs <4 x float> @upgrade.s.buffer.load.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x float> @upgrade.struct.buffer.load.format.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x float> @upgrade.struct.buffer.load.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <4 x float> @upgrade.struct.tbuffer.load.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %val +} + +declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <4 x float> @upgrade.tbuffer.load.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <4 x float> %val +} + +declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs <4 x half> @upgrade.buffer.load.format.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <4 x half> @upgrade.raw.buffer.load.format.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <4 x half> @upgrade.raw.buffer.load.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x half> @upgrade.raw.tbuffer.load.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) +define amdgpu_cs <4 x half> @upgrade.s.buffer.load.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x half> @upgrade.struct.buffer.load.format.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x half> @upgrade.struct.buffer.load.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <4 x half> @upgrade.struct.tbuffer.load.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x half> %val +} + +declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <4 x half> @upgrade.tbuffer.load.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <4 x half> %val +} + +declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32>, i32, i32, i32) +define amdgpu_cs <4 x i16> @upgrade.raw.buffer.load.v4i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v4i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <4 x i16> %val +} + +declare <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x i16> @upgrade.struct.buffer.load.v4i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v4i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x i16> %val +} + +declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { <4 x i32>, i32 } @upgrade.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_v4i32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { <4 x i32>, i32 } %val +} + +declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) +define amdgpu_cs <4 x i32> @upgrade.raw.buffer.load.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <4 x i32> %val +} + +declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x i32> @upgrade.raw.tbuffer.load.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x i32> %val +} + +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) +define amdgpu_cs <4 x i32> @upgrade.s.buffer.load.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0) + ret <4 x i32> %val +} + +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x i32> @upgrade.struct.buffer.load.format.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x i32> %val +} + +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x i32> @upgrade.struct.buffer.load.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x i32> %val +} + +declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs <4 x i32> @upgrade.struct.tbuffer.load.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x i32> %val +} + +declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs <4 x i32> @upgrade.tbuffer.load.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret <4 x i32> %val +} + +declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32) +define amdgpu_cs <4 x i64> @upgrade.s.buffer.load.v4i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v4i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 0, i32 0) + ret <4 x i64> %val +} + +declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) +define amdgpu_cs <4 x i8> @upgrade.raw.buffer.load.v4i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.v4i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <4 x i8> %val +} + +declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) +define amdgpu_cs <4 x i8> @upgrade.s.buffer.load.v4i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v4i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 0, i32 0) + ret <4 x i8> %val +} + +declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs <4 x i8> @upgrade.struct.buffer.load.v4i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.v4i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <4 x i8> %val +} + +declare <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32>, i32, i32) +define amdgpu_cs <4 x ptr addrspace(1)> @upgrade.s.buffer.load.v4p1(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v4p1 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32> %rsrc, i32 0, i32 0) + ret <4 x ptr addrspace(1)> %val +} + +declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32) +define amdgpu_cs <8 x float> @upgrade.s.buffer.load.v8f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v8f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 0, i32 0) + ret <8 x float> %val +} + +declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32) +define amdgpu_cs <8 x i32> @upgrade.s.buffer.load.v8i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v8i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 0, i32 0) + ret <8 x i32> %val +} + +declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32) +define amdgpu_cs <8 x i64> @upgrade.s.buffer.load.v8i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v8i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 0, i32 0) + ret <8 x i64> %val +} + +declare <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32>, i32, i32) +define amdgpu_cs <8 x ptr addrspace(1)> @upgrade.s.buffer.load.v8p1(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.v8p1 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32> %rsrc, i32 0, i32 0) + ret <8 x ptr addrspace(1)> %val +} + +declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1) +define amdgpu_cs double @upgrade.buffer.atomic.fadd.f64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.fadd.f64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret double %val +} + +declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32) +define amdgpu_cs double @upgrade.raw.buffer.atomic.fadd.f64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.fadd.f64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret double %val +} + +declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32) +define amdgpu_cs double @upgrade.raw.buffer.atomic.fmax.f64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.fmax.f64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret double %val +} + +declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32) +define amdgpu_cs double @upgrade.raw.buffer.atomic.fmin.f64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.fmin.f64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret double %val +} + +declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs double @upgrade.struct.buffer.atomic.fadd.f64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.fadd.f64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret double %val +} + +declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs double @upgrade.struct.buffer.atomic.fmax.f64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.fmax.f64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret double %val +} + +declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs double @upgrade.struct.buffer.atomic.fmin.f64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.fmin.f64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret double %val +} + +declare { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { float, i32 } @upgrade.struct.buffer.load.format.sl_f32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_f32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { float, i32 } %val +} + +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) +define amdgpu_cs float @upgrade.buffer.atomic.fadd.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.fadd.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret float %val +} + +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs float @upgrade.buffer.load.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret float %val +} + +declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs float @upgrade.buffer.load.format.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret float %val +} + +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32) +define amdgpu_cs float @upgrade.raw.buffer.atomic.fadd.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.fadd.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32) +define amdgpu_cs float @upgrade.raw.buffer.atomic.fmax.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.fmax.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32) +define amdgpu_cs float @upgrade.raw.buffer.atomic.fmin.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.fmin.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.raw.buffer.atomic.swap.f32(float, <4 x i32>, i32, i32, i32) +define amdgpu_cs float @upgrade.raw.buffer.atomic.swap.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.swap.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.raw.buffer.atomic.swap.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.atomic.swap.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs float @upgrade.raw.buffer.load.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) +define amdgpu_cs float @upgrade.raw.buffer.load.format.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs float @upgrade.raw.tbuffer.load.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) +define amdgpu_cs float @upgrade.s.buffer.load.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs float @upgrade.struct.buffer.atomic.fadd.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.fadd.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.struct.buffer.atomic.swap.f32(float, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs float @upgrade.struct.buffer.atomic.swap.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.swap.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.struct.buffer.atomic.swap.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.atomic.swap.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs float @upgrade.struct.buffer.load.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs float @upgrade.struct.buffer.load.format.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs float @upgrade.struct.tbuffer.load.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret float %val +} + +declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs float @upgrade.tbuffer.load.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret float %val +} + +declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs half @upgrade.buffer.load.format.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.format.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.buffer.load.format.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret half %val +} + +declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs half @upgrade.raw.buffer.load.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret half %val +} + +declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32) +define amdgpu_cs half @upgrade.raw.buffer.load.format.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.format.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret half %val +} + +declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs half @upgrade.raw.tbuffer.load.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret half %val +} + +declare half @llvm.amdgcn.s.buffer.load.f16(<4 x i32>, i32, i32) +define amdgpu_cs half @upgrade.s.buffer.load.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0) + ret half %val +} + +declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs half @upgrade.struct.buffer.load.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret half %val +} + +declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs half @upgrade.struct.buffer.load.format.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret half %val +} + +declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs half @upgrade.struct.tbuffer.load.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret half %val +} + +declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs half @upgrade.tbuffer.load.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call half @llvm.amdgcn.tbuffer.load.f16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret half %val +} + +declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs i16 @upgrade.buffer.load.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret i16 %val +} + +declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) +define amdgpu_cs i16 @upgrade.raw.buffer.load.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i16 %val +} + +declare i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i16 @upgrade.struct.buffer.load.format.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i16 %val +} + +declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i16 @upgrade.struct.buffer.load.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i16 %val +} + +declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32) +define amdgpu_cs i256 @upgrade.s.buffer.load.i256(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.i256 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i256 @llvm.amdgcn.s.buffer.load.i256(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 0, i32 0) + ret i256 %val +} + +declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs { i32, i32 } @upgrade.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.sl_i32i32s +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret { i32, i32 } %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.add.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.add.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.and.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.and.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.cmpswap(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.cmpswap +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 undef, i32 0, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 undef, i32 0, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.csub(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.csub +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.csub.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.or.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.or.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.smax.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.smax.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.smin.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.smin.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.sub.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.sub.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.swap.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.swap.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.umax.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.umax.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.umin.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.umin.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) +define amdgpu_cs i32 @upgrade.buffer.atomic.xor.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.xor.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.add.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.add.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.add(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.add +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.and.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.and.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.cmpswap.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.cmpswap.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 undef, i32 0, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 undef, i32 0, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.dec.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.dec.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.inc.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.inc.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.or.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.or.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.smax.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.smax.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.smin.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.smin.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.sub.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.sub.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.sub(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.sub +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.swap.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.swap.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.umax.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.umax.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.umin.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.umin.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.atomic.xor.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.xor.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.buffer.load.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.raw.tbuffer.load.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.load.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) +define amdgpu_cs i32 @upgrade.s.buffer.load.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.add.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.add.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.add(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.add +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.and.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.and.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.cmpswap.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.cmpswap.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 undef, i32 0, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 undef, i32 0, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.dec.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.dec.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.inc.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.inc.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.or.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.or.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.smax.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.smax.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.smin.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.smin.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.sub.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.sub.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.sub(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.sub +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.swap.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.swap.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.umax.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.umax.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.umin.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.umin.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.atomic.xor.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.xor.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.load.format.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.format.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.buffer.load.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs i32 @upgrade.struct.tbuffer.load.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.load.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs i32 @upgrade.tbuffer.load.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.load.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i32 @llvm.amdgcn.tbuffer.load.i32(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %val = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret i32 %val +} + +declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32) +define amdgpu_cs i512 @upgrade.s.buffer.load.i512(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.i512 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i512 @llvm.amdgcn.s.buffer.load.i512(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 0, i32 0) + ret i512 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.add.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.add.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.and.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.and.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.or.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.or.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.smax.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.smax.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.smin.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.smin.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.sub.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.sub.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.swap.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.swap.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.umax.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.umax.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.umin.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.umin.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, <4 x i32>, i32, i32, i1) +define amdgpu_cs i64 @upgrade.buffer.atomic.xor.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.atomic.xor.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false) + %val = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false) + ret i64 %val +} + +declare i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32) +define amdgpu_cs i64 @upgrade.raw.buffer.atomic.add.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.add.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i64 %val +} + +declare i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64, i64, <4 x i32>, i32, i32, i32) +define amdgpu_cs i64 @upgrade.raw.buffer.atomic.cmpswap.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.atomic.cmpswap.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 undef, i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 undef, i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i64 %val +} + +declare i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i64 @upgrade.struct.buffer.atomic.add.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.add.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i64 %val +} + +declare i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64, i64, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i64 @upgrade.struct.buffer.atomic.cmpswap.i64(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.atomic.cmpswap.i64 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 undef, i64 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 undef, i64 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i64 %val +} + +declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) +define amdgpu_cs i8 @upgrade.buffer.load.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.load.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + %val = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret i8 %val +} + +declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) +define amdgpu_cs i8 @upgrade.raw.buffer.load.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i8 %val +} + +declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32) +define amdgpu_cs i8 @upgrade.s.buffer.load.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0) + ret i8 %val +} + +declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) +define amdgpu_cs i8 @upgrade.struct.buffer.load.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret i8 %val +} + +declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32) +define amdgpu_cs i96 @upgrade.s.buffer.load.i96(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.s.buffer.load.i96 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: %val = call i96 @llvm.amdgcn.s.buffer.load.i96(ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0) + %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 0, i32 0) + ret i96 %val +} + +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.format.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.format.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.i16(i16 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.i16(i16 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.i8(i8 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.i8(i8 undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.buffer.store.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.buffer.store.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32>, ptr addrspace(3), i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.load.lds(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.load.lds +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %.rsrc.as.ptr, ptr addrspace(3) undef, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) undef, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.format.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.format.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.i16(i16 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i16(i16 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.i8(i8 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i8(i8 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v2i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v2i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v4i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v4i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.buffer.store.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.buffer.store.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.raw.tbuffer.store.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.raw.tbuffer.store.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32>, ptr addrspace(3), i32, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.load.lds(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.load.lds +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %.rsrc.as.ptr, ptr addrspace(3) undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.f32(float undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f32(float undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.i16(i16, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.format.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.format.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.i16(i16 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.i16(i16 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.i8(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.i8 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.i8(i8 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.i8(i8 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v2i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v2i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v3f16(<3 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v3f16(<3 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v3f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v3f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v4i16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v4i16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.buffer.store.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.buffer.store.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v2f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v2f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) +define amdgpu_cs void @upgrade.struct.tbuffer.store.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.struct.tbuffer.store.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.f16(half undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.f16(half undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.v2f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.v2f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.v2i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.v2i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.v3f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.v3f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.v3i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.v3i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.v4f16(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.v4f16 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.v4f32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.v4f32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +define amdgpu_cs void @upgrade.tbuffer.store.v4i32(<4 x i32> %rsrc) { +; CHECK-LABEL: @upgrade.tbuffer.store.v4i32 +; CHECK: %.rsrc.as.int = bitcast <4 x i32> %rsrc to i128 +; CHECK: %.rsrc.as.ptr = inttoptr i128 %.rsrc.as.int to ptr addrspace(8) +; CHECK: call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> undef, ptr addrspace(8) %.rsrc.as.ptr, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -1,14 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s ; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-atomic-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) -declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) -declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32 immarg) -define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) { +define amdgpu_cs void @atomic_add(ptr addrspace(8) inreg %arg) { ; IR-LABEL: @atomic_add( ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) @@ -22,7 +21,7 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: ret void @@ -43,11 +42,11 @@ ; GCN-NEXT: .LBB0_2: ; GCN-NEXT: s_endpgm .entry: - call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0) + call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0) ret void } -define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { +define amdgpu_cs void @atomic_add_and_format(ptr addrspace(8) inreg %arg) { ; IR-LABEL: @atomic_add_and_format( ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) @@ -61,13 +60,15 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] ; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]] -; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) +; IR-NEXT: [[B:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 +; IR-NEXT: [[C:%.*]] = bitcast i128 [[B]] to <4 x i32> +; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[C]], ptr addrspace(8) [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_add_and_format: @@ -97,12 +98,14 @@ ; GCN-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen ; GCN-NEXT: s_endpgm .entry: - %a = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> %arg, <4 x i32> %arg, i32 %a, i32 0, i32 0, i32 0) + %a = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0) + %b = ptrtoint ptr addrspace(8) %arg to i128 + %c = bitcast i128 %b to <4 x i32> + call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> %c, ptr addrspace(8) %arg, i32 %a, i32 0, i32 0, i32 0) ret void } -define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) { +define amdgpu_cs void @atomic_sub(ptr addrspace(8) inreg %arg) { ; IR-LABEL: @atomic_sub( ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) @@ -116,7 +119,7 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: ret void @@ -137,11 +140,11 @@ ; GCN-NEXT: .LBB2_2: ; GCN-NEXT: s_endpgm .entry: - call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0) + call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0) ret void } -define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { +define amdgpu_cs void @atomic_sub_and_format(ptr addrspace(8) inreg %arg) { ; IR-LABEL: @atomic_sub_and_format( ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) @@ -155,13 +158,15 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] ; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]] -; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) +; IR-NEXT: [[B:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 +; IR-NEXT: [[C:%.*]] = bitcast i128 [[B]] to <4 x i32> +; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[C]], ptr addrspace(8) [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_sub_and_format: @@ -191,12 +196,14 @@ ; GCN-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen ; GCN-NEXT: s_endpgm .entry: - %a = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> %arg, <4 x i32> %arg, i32 %a, i32 0, i32 0, i32 0) + %a = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0) + %b = ptrtoint ptr addrspace(8) %arg to i128 + %c = bitcast i128 %b to <4 x i32> + call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> %c, ptr addrspace(8) %arg, i32 %a, i32 0, i32 0, i32 0) ret void } -define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) { +define amdgpu_cs void @atomic_xor(ptr addrspace(8) inreg %arg) { ; IR-LABEL: @atomic_xor( ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) @@ -211,7 +218,7 @@ ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], ptr addrspace(8) [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: ret void @@ -233,11 +240,11 @@ ; GCN-NEXT: .LBB4_2: ; GCN-NEXT: s_endpgm .entry: - call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0) + call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0) ret void } -define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { +define amdgpu_cs void @atomic_xor_and_format(ptr addrspace(8) inreg %arg) { ; IR-LABEL: @atomic_xor_and_format( ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) @@ -252,14 +259,16 @@ ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], ptr addrspace(8) [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1 ; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]] -; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0) +; IR-NEXT: [[B:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 +; IR-NEXT: [[C:%.*]] = bitcast i128 [[B]] to <4 x i32> +; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[C]], ptr addrspace(8) [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_xor_and_format: @@ -291,7 +300,9 @@ ; GCN-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen ; GCN-NEXT: s_endpgm .entry: - %a = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> %arg, <4 x i32> %arg, i32 %a, i32 0, i32 0, i32 0) + %a = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0) + %b = ptrtoint ptr addrspace(8) %arg to i128 + %c = bitcast i128 %b to <4 x i32> + call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> %c, ptr addrspace(8) %arg, i32 %a, i32 0, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll @@ -4,7 +4,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s -define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX908_GFX11: bb.1 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -14,9 +14,9 @@ ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908_GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -27,15 +27,15 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX908_GFX11: bb.1 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -45,10 +45,10 @@ ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908_GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -59,16 +59,16 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX908_GFX11: bb.1 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -78,10 +78,10 @@ ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908_GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -92,16 +92,16 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX908_GFX11: bb.1 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -111,12 +111,12 @@ ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908_GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908_GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -127,16 +127,16 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s -define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -13,9 +13,9 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn @@ -27,16 +27,16 @@ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret float %ret } -define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -46,10 +46,10 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn @@ -61,17 +61,17 @@ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %ret } -define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -81,10 +81,10 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn @@ -96,17 +96,17 @@ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret float %ret } -define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -116,12 +116,12 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn @@ -133,17 +133,17 @@ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %ret } -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -14,15 +14,15 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f64_offen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_offen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -34,16 +34,16 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f64_idxen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_idxen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -55,16 +55,16 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f64_bothen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_bothen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -76,18 +76,18 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -99,9 +99,9 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec @@ -109,11 +109,11 @@ ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret double %ret } -define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -125,10 +125,10 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec @@ -136,11 +136,11 @@ ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret double %ret } -define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -152,10 +152,10 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec @@ -163,11 +163,11 @@ ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret double %ret } -define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -179,12 +179,12 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec @@ -192,9 +192,9 @@ ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret double %ret } -declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) -declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -13,9 +13,9 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -26,15 +26,15 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -44,10 +44,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -58,16 +58,16 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -77,10 +77,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -91,16 +91,16 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -110,12 +110,12 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): @@ -126,16 +126,16 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -12,16 +12,16 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -31,17 +31,17 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret } -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -51,17 +51,17 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -71,17 +71,17 @@ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret <2 x half> %ret } -declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll @@ -4,17 +4,17 @@ ; GCN: buffer_store_dword ; GCN: buffer_load_dword ; GCN: buffer_store_dword -define amdgpu_cs void @test1(<4 x i32> inreg %buf, i32 %off) { +define amdgpu_cs void @test1(ptr addrspace(8) %buf, i32 %off) { .entry: - call void @llvm.amdgcn.raw.buffer.store.i32(i32 0, <4 x i32> %buf, i32 8, i32 0, i32 0) - %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %buf, i32 %off, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %buf, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 0, ptr addrspace(8) %buf, i32 8, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %buf, i32 %off, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, ptr addrspace(8) %buf, i32 0, i32 0, i32 0) ret void } -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32) #2 -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #3 +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32) #3 attributes #2 = { nounwind readonly } attributes #3 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX90A -declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) -declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) -declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) -declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg) -declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) -declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -15,7 +15,7 @@ declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) -define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -27,11 +27,11 @@ ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc @@ -39,12 +39,12 @@ ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -60,12 +60,12 @@ ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -77,11 +77,11 @@ ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc @@ -89,12 +89,12 @@ ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -110,12 +110,12 @@ ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -127,11 +127,11 @@ ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc @@ -139,12 +139,12 @@ ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -160,12 +160,12 @@ ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -177,11 +177,11 @@ ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc @@ -189,12 +189,12 @@ ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -210,12 +210,12 @@ ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -227,11 +227,11 @@ ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc @@ -239,12 +239,12 @@ ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -260,12 +260,12 @@ ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -277,11 +277,11 @@ ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc @@ -289,12 +289,12 @@ ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -310,7 +310,7 @@ ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir @@ -11,13 +11,15 @@ ; GCN-LABEL: name: s_buffer_load_s32 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s32)) + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s32)) ; GCN-NEXT: S_ENDPGM 0, implicit [[AMDGPU_S_BUFFER_LOAD]](s32) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s32), align 4) S_ENDPGM 0, implicit %2 ... @@ -31,15 +33,17 @@ ; GCN-LABEL: name: s_buffer_load_v3s32 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32) + ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR1]](<3 x s32>) + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s96), align 4) S_ENDPGM 0, implicit %2 ... @@ -53,16 +57,18 @@ ; GCN-LABEL: name: s_buffer_load_v3p3 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32) + ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GCN-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(<3 x p3>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(<3 x p3>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s96), align 4) S_ENDPGM 0, implicit %2 ... @@ -76,16 +82,18 @@ ; GCN-LABEL: name: s_buffer_load_v6s16 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32) + ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GCN-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(<6 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s96), align 4) S_ENDPGM 0, implicit %2 ... @@ -99,15 +107,17 @@ ; GCN-LABEL: name: s_buffer_load_v6s32 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s192), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s32>) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s192), align 4) + ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) + ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32), [[UV8]](s32), [[UV9]](s32) + ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR1]](<6 x s32>) + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(<6 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(<6 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s192), align 4) S_ENDPGM 0, implicit %2 ... @@ -121,15 +131,17 @@ ; GCN-LABEL: name: s_buffer_load_v3s64 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s64>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s192), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s64>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[UV2]](s64) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s64>) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s64>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s192), align 4) + ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64), [[UV6:%[0-9]+]]:_(s64), [[UV7:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s64>) + ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[UV4]](s64), [[UV5]](s64), [[UV6]](s64) + ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR1]](<3 x s64>) + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(<3 x s64>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(<3 x s64>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s192), align 4) S_ENDPGM 0, implicit %2 ... @@ -143,24 +155,26 @@ ; GCN-LABEL: name: s_buffer_load_v12s8 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C1]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GCN-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; GCN-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C2]](s32) ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; GCN-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32) - ; GCN-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) - ; GCN-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) - ; GCN-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) - ; GCN-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) - ; GCN-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) - ; GCN-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32) + ; GCN-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C3]](s32) + ; GCN-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C1]](s32) + ; GCN-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C2]](s32) + ; GCN-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32) + ; GCN-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C1]](s32) + ; GCN-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C2]](s32) + ; GCN-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C3]](s32) ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C4]] + ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C4]] ; GCN-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]] ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] @@ -170,7 +184,7 @@ ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; GCN-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; GCN-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C4]] + ; GCN-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C4]] ; GCN-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C4]] ; GCN-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) ; GCN-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] @@ -180,7 +194,7 @@ ; GCN-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) ; GCN-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] ; GCN-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) - ; GCN-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C4]] + ; GCN-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[UV6]], [[C4]] ; GCN-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] ; GCN-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) ; GCN-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] @@ -192,9 +206,9 @@ ; GCN-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GCN-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(<12 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s96), align 4) %3:_(<12 x s16>) = G_ANYEXT %2 S_ENDPGM 0, implicit %3 @@ -209,15 +223,17 @@ ; GCN-LABEL: name: s_buffer_load_s96 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) - %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p8) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32) + ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR1]](<3 x s32>) + %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 - %2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 + %2:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 :: (dereferenceable invariant load (s96), align 4) S_ENDPGM 0, implicit %2 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -12,18 +12,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } -define amdgpu_ps float @raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -33,18 +33,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } -define amdgpu_ps <2 x float> @raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -56,21 +56,21 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub0 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to <2 x float> ret <2 x float> %cast } -define amdgpu_ps void @raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -82,17 +82,17 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_ATOMIC_ADD_X2_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_ATOMIC_ADD_X2_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; All operands need regbank legalization -define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -103,9 +103,9 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -133,7 +133,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -145,13 +145,13 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } ; All operands need regbank legalization -define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -162,9 +162,9 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -192,7 +192,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -203,11 +203,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -217,20 +217,20 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } ; Natural mapping + slc -define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -240,18 +240,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) %cast = bitcast i32 %ret to float ret float %cast } -declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32 immarg) #0 -declare i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) #0 +declare i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64, ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -13,21 +13,21 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 ; CHECK-NEXT: $vgpr0 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } ; Natural mapping -define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -38,18 +38,18 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; All operands need regbank legalization -define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -61,9 +61,9 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -93,7 +93,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec @@ -106,13 +106,13 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[COPY15]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } ; All operands need regbank legalization -define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -124,9 +124,9 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -156,7 +156,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -167,11 +167,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -182,16 +182,16 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 ; CHECK-NEXT: $vgpr0 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } @@ -199,18 +199,18 @@ ; FIXME: 64-bit not handled ; ; Natural mapping -; define amdgpu_ps <2 x float> @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { -; %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps <2 x float> @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ; %cast = bitcast i64 %ret to <2 x float> ; ret <2 x float> %cast ; } -; define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { -; %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ; ret void ; } -declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32 immarg) #0 -declare i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64, i64, <4 x i32>, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32 immarg) #0 +declare i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64, i64, ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll @@ -1,25 +1,25 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s ; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) -; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr(<4 x s32>), %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 8) (in function: buffer_atomic_add_f32_rtn) +; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %{{[0-9]+}}:vgpr, %{{[0-9]+|rsrcVec}}:sgpr(<4 x s32>), %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr, 0, 0, 0 :: (volatile dereferenceable load store (s32) on %ir.rsrc.load, align 1, addrspace 8) (in function: buffer_atomic_add_f32_rtn) ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 %soffset) { main_body: - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store float %ret, ptr undef ret void } ; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn: ; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { main_body: - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store <2 x half> %ret, ptr undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A ; Natural mapping -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -13,10 +13,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -27,16 +27,16 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -46,10 +46,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -60,17 +60,17 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -80,9 +80,9 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -93,16 +93,16 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } ; Natural mapping, no voffset -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -112,9 +112,9 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -125,16 +125,16 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } ; All operands need regbank legalization -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -145,9 +145,9 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -175,7 +175,7 @@ ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} @@ -196,9 +196,9 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -226,7 +226,7 @@ ; GFX90A-NEXT: bb.3: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -237,12 +237,12 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; All operands need regbank legalization, no voffset -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, ptr addrspace(8) %rsrc, i32 %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -253,8 +253,8 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -281,7 +281,7 @@ ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} @@ -302,8 +302,8 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} @@ -330,7 +330,7 @@ ; GFX90A-NEXT: bb.3: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -341,11 +341,11 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -355,10 +355,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 ; GFX90A: bb.1 (%ir-block.0): @@ -369,18 +369,18 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4095 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; Natural mapping + slc -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -390,10 +390,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX90A: bb.1 (%ir-block.0): @@ -404,16 +404,16 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -423,10 +423,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -437,16 +437,16 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -456,9 +456,9 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -469,15 +469,15 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s ; Natural mapping -define amdgpu_ps half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; PACKED-LABEL: name: raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -12,10 +12,10 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset @@ -26,17 +26,17 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret half %val } -define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; PACKED-LABEL: name: raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -45,10 +45,10 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset @@ -59,10 +59,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -76,17 +76,17 @@ ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %val } ; FIXME -; define amdgpu_ps <3 x half> @raw_buffer_load_format_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { -; %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps <3 x half> @raw_buffer_load_format_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ; ret <3 x half> %val ; } -define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; PACKED-LABEL: name: raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -95,10 +95,10 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; PACKED-NEXT: $vgpr0 = COPY [[COPY6]] @@ -112,10 +112,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 @@ -139,12 +139,12 @@ ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <4 x half> %val } ; Waterfall for rsrc and soffset, copy for voffset -define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; PACKED-LABEL: name: raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: successors: %bb.2(0x80000000) @@ -154,9 +154,9 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} @@ -183,7 +183,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -204,9 +204,9 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} @@ -233,7 +233,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -245,11 +245,11 @@ ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret half %val } -define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095(ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; PACKED-LABEL: name: raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -258,10 +258,10 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; PACKED-NEXT: $vgpr0 = COPY [[COPY6]] @@ -275,10 +275,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 @@ -303,13 +303,13 @@ ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %voffset = add i32 %voffset.base, 4095 - %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <4 x half> %val } -declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32 immarg) #0 -declare <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32 immarg) #0 -declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32 immarg) #0 +declare half @llvm.amdgcn.raw.buffer.load.format.f16(ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -11,17 +11,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -30,20 +30,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x float> %val } -define amdgpu_ps <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -52,10 +52,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -63,11 +63,11 @@ ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]] ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <3 x float> %val } -define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -76,10 +76,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -89,12 +89,12 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <4 x float> %val } ; Waterfall for rsrc and soffset, copy for voffset -define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -104,9 +104,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} @@ -133,7 +133,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -145,11 +145,11 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095(ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -158,10 +158,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -172,13 +172,13 @@ ; CHECK-NEXT: $vgpr3 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %voffset = add i32 %voffset.base, 4095 - %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <4 x float> %val } -declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32 immarg) #0 -declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32 immarg) #0 -declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -3,7 +3,7 @@ ; FIXME: Test with SI when argument lowering not broken for f16 ; Natural mapping -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -12,18 +12,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Copies for VGPR arguments -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7 @@ -32,19 +32,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for rsrc -define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -54,9 +54,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -79,7 +79,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -91,12 +91,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for rsrc and soffset -define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -106,9 +106,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -134,7 +134,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -146,12 +146,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Natural mapping + glc -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -160,18 +160,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 1) ret float %val } ; Natural mapping + slc -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -180,18 +180,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 2, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 2, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret float %val } ; Natural mapping + dlc -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -200,18 +200,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 4, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 4, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 4) ret float %val } ; Natural mapping + slc + dlc -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -220,18 +220,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 6, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 6, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 6) ret float %val } ; Natural mapping + glc + dlc -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -240,18 +240,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 5, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 5, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 5) ret float %val } ; Natural mapping + glc + slc + dlc -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc_dlc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -260,18 +260,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 7, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 7, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 7) ret float %val } ; Natural mapping -define amdgpu_ps <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -280,20 +280,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x float> %val } -define amdgpu_ps <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -302,10 +302,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -313,11 +313,11 @@ ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]] ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <3 x float> %val } -define amdgpu_ps <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -326,10 +326,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -339,11 +339,11 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <4 x float> %val } -define amdgpu_ps half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -352,17 +352,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret half %val } -define amdgpu_ps <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -371,23 +371,23 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %val } ; FIXME: Crashes -; define amdgpu_ps <3 x half> @raw_buffer_load_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { -; %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps <3 x half> @raw_buffer_load_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; %val = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ; ret <3 x half> %val ; } -define amdgpu_ps <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -396,20 +396,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <4 x half> %val } -define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -418,19 +418,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %zext = zext i8 %val to i32 %cast = bitcast i32 %zext to float ret float %cast } -define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -439,19 +439,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %zext = sext i8 %val to i32 %cast = bitcast i32 %zext to float ret float %cast } -define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -460,19 +460,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %zext = zext i16 %val to i32 %cast = bitcast i32 %zext to float ret float %cast } -define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -481,20 +481,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %sext = sext i16 %val to i32 %cast = bitcast i32 %sext to float ret float %cast } ; Waterfall for rsrc -define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -504,9 +504,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -529,7 +529,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -541,12 +541,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret half %val } ; Waterfall for rsrc -define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -556,9 +556,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -581,7 +581,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -593,13 +593,13 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) %zext = zext i8 %val to i32 %cast = bitcast i32 %zext to float ret float %cast } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -608,16 +608,16 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -626,16 +626,16 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -644,18 +644,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16(ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -664,18 +664,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 16 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -684,18 +684,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096(ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -704,21 +704,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4096 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095(<4 x i32> inreg %rsrc, i32 %voffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095(ptr addrspace(8) inreg %rsrc, i32 %voffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -727,17 +727,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096(<4 x i32> inreg %rsrc, i32 %voffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096(ptr addrspace(8) inreg %rsrc, i32 %voffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -746,17 +746,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -765,20 +765,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 16 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -787,20 +787,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -809,21 +809,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000(ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -833,11 +833,11 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -860,7 +860,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -873,12 +873,12 @@ ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 5000 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. -define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000(<4 x i32> %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000(ptr addrspace(8) %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -888,9 +888,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec @@ -916,7 +916,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -929,19 +929,19 @@ ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 5000 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg) -declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32 immarg) -declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32 immarg) -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32 immarg) +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32 immarg) +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32 immarg) -declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32 immarg) -declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32 immarg) -declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32 immarg) -declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32 immarg) +declare half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32 immarg) +declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(ptr addrspace(8), i32, i32, i32 immarg) +declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32 immarg) -declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32 immarg) -declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32 immarg) +declare i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8), i32, i32, i32 immarg) +declare i16 @llvm.amdgcn.raw.buffer.load.i16(ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16(<4 x i32> inreg %rsrc, half %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16(ptr addrspace(8) inreg %rsrc, half %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -11,11 +11,11 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -25,17 +25,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16(<4 x i32> inreg %rsrc, half %val, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16(ptr addrspace(8) inreg %rsrc, half %val, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -44,10 +44,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -57,16 +57,16 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -75,15 +75,15 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -93,17 +93,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(<4 x i32> inreg %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -112,7 +112,6 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 @@ -122,8 +121,9 @@ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -133,20 +133,20 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; Make sure unpack code is emitted outside of loop -define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(<4 x i32> %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(ptr addrspace(8) %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -156,7 +156,6 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 @@ -166,7 +165,8 @@ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: @@ -177,8 +177,8 @@ ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec @@ -189,7 +189,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -209,12 +209,12 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: @@ -225,8 +225,8 @@ ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec @@ -237,7 +237,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -248,11 +248,11 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -261,15 +261,15 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -279,17 +279,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -298,15 +298,15 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -316,17 +316,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -335,15 +335,15 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -353,18 +353,18 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 - call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -373,15 +373,15 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -391,18 +391,18 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -411,7 +411,6 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 @@ -421,8 +420,9 @@ ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -432,23 +432,23 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 - call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } ; Check what happens with offset add inside a waterfall loop -define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16_add_4096(<4 x i32> %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16_add_4096(ptr addrspace(8) %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16_add_4096 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -458,7 +458,6 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 @@ -471,7 +470,8 @@ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: @@ -482,8 +482,8 @@ ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec @@ -494,7 +494,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -514,15 +514,15 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: @@ -533,8 +533,8 @@ ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec @@ -545,7 +545,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -557,10 +557,10 @@ ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 - call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -o - %s | FileCheck %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck %s -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -11,17 +11,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f32(<4 x i32> inreg %rsrc, float %val, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f32(ptr addrspace(8) inreg %rsrc, float %val, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -30,16 +30,16 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -48,19 +48,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32(<4 x i32> inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32(ptr addrspace(8) inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -69,20 +69,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32(<4 x i32> inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32(ptr addrspace(8) inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -91,21 +91,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32(<4 x i32> %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32(ptr addrspace(8) %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -115,14 +115,14 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -133,8 +133,8 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec @@ -145,7 +145,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -156,11 +156,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_soffset4095(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_soffset4095(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_soffset4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -169,19 +169,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_soffset4096(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_soffset4096(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_soffset4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -190,19 +190,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_16(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_16(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -211,20 +211,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 - call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_4095(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_4095(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -233,20 +233,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_4096(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_4096(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32_voffset_add_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -255,25 +255,25 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 - call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } ; Check what happens with offset add inside a waterfall loop -define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32_add_4096(<4 x i32> %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32_add_4096(ptr addrspace(8) %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32_add_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -283,17 +283,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -304,8 +304,8 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec @@ -316,7 +316,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -328,11 +328,11 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -3,7 +3,7 @@ ; FIXME: Test with SI when argument lowering not broken for f16 ; Natural mapping -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -12,18 +12,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; Copies for VGPR arguments -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float inreg %val, i32 inreg %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, float inreg %val, i32 inreg %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8 @@ -32,20 +32,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; Waterfall for rsrc -define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -55,10 +55,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -81,7 +81,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -92,12 +92,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; Waterfall for soffset -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -107,10 +107,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -124,7 +124,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -135,12 +135,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; Waterfall for rsrc and soffset -define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, float %val, i32 %voffset, i32 %soffset) { +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, float %val, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -150,10 +150,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -179,7 +179,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -190,11 +190,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -203,17 +203,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -222,17 +222,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -241,17 +241,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 3, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 3, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_dlc(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -260,17 +260,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 4, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 4, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc_dlc(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -279,17 +279,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 6, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 6, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_dlc(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -298,17 +298,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 5, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 5, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc_dlc(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -317,17 +317,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 7, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 7, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -336,19 +336,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32(<4 x i32> inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32(ptr addrspace(8) inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -357,20 +357,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32(<4 x i32> inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32(ptr addrspace(8) inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -379,21 +379,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i8(<4 x i32> inreg %rsrc, i32 %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i8(ptr addrspace(8) inreg %rsrc, i32 %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i8 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -402,18 +402,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 - call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i16(<4 x i32> inreg %rsrc, i32 %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i16(ptr addrspace(8) inreg %rsrc, i32 %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -422,18 +422,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 - call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16(<4 x i32> inreg %rsrc, half %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16(ptr addrspace(8) inreg %rsrc, half %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -442,17 +442,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -461,17 +461,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(<4 x i32> inreg %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -480,19 +480,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(<4 x i32> %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(ptr addrspace(8) %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -502,12 +502,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -518,8 +518,8 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec @@ -530,7 +530,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -541,11 +541,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095(<4 x i32> inreg %rsrc, float %val, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095(ptr addrspace(8) inreg %rsrc, float %val, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -554,16 +554,16 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096(<4 x i32> inreg %rsrc, float %val, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096(ptr addrspace(8) inreg %rsrc, float %val, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -572,18 +572,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -592,18 +592,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -612,18 +612,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -632,21 +632,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -655,17 +655,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -674,17 +674,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -693,18 +693,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 - call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -713,18 +713,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096(ptr addrspace(8) inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -733,22 +733,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 - call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } ; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. -define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000(<4 x i32> %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000(ptr addrspace(8) %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -758,10 +758,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec @@ -787,7 +787,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -799,12 +799,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 5000 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } ; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. -define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset(<4 x i32> %rsrc, float %val, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset(ptr addrspace(8) %rsrc, float %val, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -814,9 +814,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -841,7 +841,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -852,18 +852,18 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 5000, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.i8(i8, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.i16(i16, ptr addrspace(8), i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.f16(half, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s -define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -11,10 +11,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset @@ -25,17 +25,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } -define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -44,10 +44,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -69,23 +69,23 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x half> %val } ; FIXME: Crashes -; define amdgpu_ps <3 x half> @raw_tbuffer_load_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { -; %val = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) +; define amdgpu_ps <3 x half> @raw_tbuffer_load_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; %val = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ; ret <3 x half> %val ; } -define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -94,10 +94,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 @@ -129,20 +129,20 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; PACKED-NEXT: $vgpr0 = COPY [[COPY6]] ; PACKED-NEXT: $vgpr1 = COPY [[COPY7]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x half> %val } -define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -152,9 +152,9 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} @@ -181,7 +181,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -202,9 +202,9 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} @@ -231,7 +231,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -243,11 +243,11 @@ ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } -define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -256,10 +256,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc @@ -270,17 +270,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) + %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret half %val } -define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -289,10 +289,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc @@ -303,17 +303,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) + %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret half %val } -define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -322,10 +322,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc @@ -336,17 +336,17 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) + %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret half %val } -define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -355,10 +355,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc @@ -369,19 +369,19 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) + %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret half %val } -declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 -declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 -declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 +declare half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 +declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 +declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -11,17 +11,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret float %val } -define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -30,20 +30,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY6]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x float> %val } -define amdgpu_ps <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -52,10 +52,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -63,11 +63,11 @@ ; CHECK-NEXT: $vgpr1 = COPY [[COPY7]] ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <3 x float> %val } -define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -76,10 +76,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -89,11 +89,11 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x float> %val } -define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -103,9 +103,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} @@ -132,7 +132,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -144,11 +144,11 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret float %val } -define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -157,17 +157,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret float %val } -define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -176,17 +176,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret float %val } -define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -195,17 +195,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret float %val } -define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -214,19 +214,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret float %val } -declare float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 -declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 -declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32 immarg, i32 immarg) #0 +declare float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 +declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 +declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s -define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -12,10 +12,10 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -26,16 +26,16 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -45,14 +45,14 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -63,22 +63,22 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; FIXME: Crashes -; define amdgpu_ps void @raw_tbuffer_store_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { -; call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) +; define amdgpu_ps void @raw_tbuffer_store_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ; ret void ; } -define amdgpu_ps void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -89,7 +89,6 @@ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 @@ -97,8 +96,9 @@ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -111,17 +111,17 @@ ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; Waterfall for rsrc -define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -132,9 +132,9 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: @@ -157,7 +157,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -178,9 +178,9 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: @@ -203,7 +203,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -214,12 +214,12 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; Waterfall for rsrc and soffset -define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -230,9 +230,9 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: @@ -258,7 +258,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -279,9 +279,9 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: @@ -307,7 +307,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -318,12 +318,12 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; Waterfall for rsrc and soffset, copy for voffset -define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half %val, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -334,9 +334,9 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} @@ -363,7 +363,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -384,9 +384,9 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} @@ -413,7 +413,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -424,11 +424,11 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(half %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -438,10 +438,10 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; PACKED: bb.1 (%ir-block.0): @@ -452,16 +452,16 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void } -define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(half %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -471,10 +471,10 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; PACKED: bb.1 (%ir-block.0): @@ -485,16 +485,16 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void } -define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(half %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -504,10 +504,10 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; PACKED: bb.1 (%ir-block.0): @@ -518,16 +518,16 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void } -define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(half %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -537,10 +537,10 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; PACKED: bb.1 (%ir-block.0): @@ -551,16 +551,16 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void } -declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32 immarg, i32 immarg) -declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) -declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) -declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s -define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -12,10 +12,10 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -26,17 +26,17 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; Waterfall for rsrc -define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -47,9 +47,9 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: @@ -72,7 +72,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -93,9 +93,9 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: @@ -118,7 +118,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -129,12 +129,12 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; Waterfall for rsrc and soffset -define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffset(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffset(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -145,9 +145,9 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: @@ -173,7 +173,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -194,9 +194,9 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: @@ -222,7 +222,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -233,12 +233,12 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; Waterfall for rsrc and soffset, copy for voffset -define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i8 %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i8 %val, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -249,9 +249,9 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} @@ -278,7 +278,7 @@ ; UNPACKED-NEXT: bb.3: ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -299,9 +299,9 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} @@ -328,7 +328,7 @@ ; PACKED-NEXT: bb.3: ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -339,8 +339,8 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } -declare void @llvm.amdgcn.raw.tbuffer.store.i8(i8, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.i8(i8, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -13,17 +13,17 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; Natural mapping -define amdgpu_ps void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x float> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -35,17 +35,17 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; Natural mapping -define amdgpu_ps void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x float> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -58,17 +58,17 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; Natural mapping -define amdgpu_ps void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x float> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -82,17 +82,17 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; Copies for VGPR arguments -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0 @@ -102,18 +102,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; Waterfall for rsrc -define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -124,9 +124,9 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -149,7 +149,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -160,12 +160,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) ret void } ; Waterfall for rsrc and soffset -define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -176,9 +176,9 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -204,7 +204,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -215,12 +215,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; Waterfall for rsrc and soffset, copy for voffset -define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -231,9 +231,9 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} @@ -260,7 +260,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -271,12 +271,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void } ; Natural mapping + glc -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -286,17 +286,17 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void } ; Natural mapping + slc -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -306,17 +306,17 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void } ; Natural mapping + glc + slc -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -326,17 +326,17 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void } ; Natural mapping + dlc -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -346,18 +346,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -367,15 +367,15 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -385,15 +385,15 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -403,17 +403,17 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -423,17 +423,17 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 16 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -443,17 +443,17 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4095 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -463,20 +463,20 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4096 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1 @@ -486,16 +486,16 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096(float %val, <4 x i32> inreg %rsrc, i32 %voffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1 @@ -505,16 +505,16 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -524,19 +524,19 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 16 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -546,19 +546,19 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 4095 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -568,20 +568,20 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 4096 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000(float %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset.base) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset.base) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -592,11 +592,11 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -619,7 +619,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -631,12 +631,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 5000 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } ; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. -define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000(float %val, <4 x i32> %rsrc, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000(float %val, ptr addrspace(8) %rsrc, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -647,9 +647,9 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec @@ -675,7 +675,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -687,11 +687,11 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 5000 - call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void } -declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) -declare void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) -declare void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) -declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -6,7 +6,7 @@ ; FIXME: Merge with regbankselect, which mostly overlaps when all types supported. ; Natural mapping -define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps i32 @s_buffer_load_i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX6-LABEL: name: s_buffer_load_i32 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -15,9 +15,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -30,9 +30,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -45,18 +45,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps i32 @s_buffer_load_i32_glc(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX6-LABEL: name: s_buffer_load_i32_glc ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -65,9 +65,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -80,9 +80,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -95,18 +95,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 1) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 1) ret i32 %val } -define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX6-LABEL: name: s_buffer_load_v2i32 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -115,9 +115,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] @@ -135,9 +135,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] @@ -155,9 +155,9 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] @@ -167,11 +167,11 @@ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <2 x i32> %val } -define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX6-LABEL: name: s_buffer_load_v3i32 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -180,9 +180,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 @@ -205,9 +205,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 @@ -230,9 +230,9 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 @@ -247,11 +247,11 @@ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 - %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <3 x i32> %val } -define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX6-LABEL: name: s_buffer_load_v8i32 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -260,9 +260,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub2 @@ -304,9 +304,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub2 @@ -348,9 +348,9 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub2 @@ -384,11 +384,11 @@ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX8-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 - %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x i32> %val } -define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX6-LABEL: name: s_buffer_load_v16i32 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -397,9 +397,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s512), align 4) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub2 @@ -473,9 +473,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s512), align 4) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub2 @@ -549,9 +549,9 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s512), align 4) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub2 @@ -617,11 +617,11 @@ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec ; GFX8-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 - %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x i32> %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_1(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_1 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -630,9 +630,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -645,9 +645,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -661,16 +661,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 1, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_glc_4 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -680,7 +680,7 @@ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -694,7 +694,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -708,16 +708,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 4, 1 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 4, 1 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 4, i32 1) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 4, i32 1) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_255(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_255 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -726,9 +726,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -741,9 +741,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -757,16 +757,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 255, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 255, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_256(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_256 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -776,7 +776,7 @@ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -790,7 +790,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -804,16 +804,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 256, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 256, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_1020 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -823,7 +823,7 @@ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -837,7 +837,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -851,16 +851,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1020, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1020, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1020, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 1020, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_1023 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -869,9 +869,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -884,9 +884,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -900,16 +900,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1023, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1023, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1023, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 1023, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_1024 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -918,9 +918,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -934,7 +934,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -948,16 +948,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1024, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1024, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1024, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 1024, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_1025 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -966,9 +966,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1025 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -981,9 +981,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1025 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -997,16 +997,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1025, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1025, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1025, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 1025, i32 0) ret i32 %val } -define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_neg1 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1015,9 +1015,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1030,9 +1030,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1045,18 +1045,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -1, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_neg4 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1065,9 +1065,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1081,7 +1081,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741823, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741823, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1094,18 +1094,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -4, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_neg8 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1114,9 +1114,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1130,7 +1130,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741822, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741822, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1143,18 +1143,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -8, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_bit31 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1163,9 +1163,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1179,7 +1179,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 536870912, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 536870912, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1192,18 +1192,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -2147483648, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_glc_bit30 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1212,9 +1212,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1228,7 +1228,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 268435456, 1 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 268435456, 1 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1241,18 +1241,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 1) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1073741824, i32 1) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_bit29 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1261,9 +1261,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1277,7 +1277,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 134217728, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 134217728, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1290,18 +1290,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 536870912, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_bit21 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1310,9 +1310,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2097152 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1326,7 +1326,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1339,18 +1339,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2097152 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 2097152, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_bit20 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1359,9 +1359,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1375,7 +1375,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 262144, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 262144, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1388,18 +1388,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1048576, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_neg_bit20 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1408,9 +1408,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1424,7 +1424,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073479680, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073479680, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1437,18 +1437,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -1048576, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_bit19 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1457,9 +1457,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 524288 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1473,7 +1473,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 131072, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 131072, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1487,16 +1487,16 @@ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 524288, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: name: s_buffer_load_i32_offset_neg_bit19 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 @@ -1505,9 +1505,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 - ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1521,7 +1521,7 @@ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073610752, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073610752, 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -1534,19 +1534,19 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 - ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32) from %ir.desc, align 1, addrspace 8) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -524288, i32 0) ret i32 %load } ; Check cases that need to be converted to MUBUF due to the offset being a VGPR. -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_offset ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1555,10 +1555,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset @@ -1569,10 +1569,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset @@ -1583,17 +1583,17 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; GFX6-LABEL: name: s_buffer_load_v2f32_vgpr_offset ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1602,10 +1602,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s64), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s64) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX6-NEXT: $vgpr0 = COPY [[COPY5]] @@ -1619,10 +1619,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s64), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s64) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX7-NEXT: $vgpr0 = COPY [[COPY5]] @@ -1636,20 +1636,20 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s64), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s64) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX8-NEXT: $vgpr0 = COPY [[COPY5]] ; GFX8-NEXT: $vgpr1 = COPY [[COPY6]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <2 x float> %val } -define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; GFX6-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1658,10 +1658,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1678,10 +1678,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1698,10 +1698,10 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1710,11 +1710,11 @@ ; GFX8-NEXT: $vgpr1 = COPY [[COPY6]] ; GFX8-NEXT: $vgpr2 = COPY [[COPY7]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <3 x float> %val } -define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; GFX6-LABEL: name: s_buffer_load_v4f32_vgpr_offset ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1723,10 +1723,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1744,10 +1744,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1765,10 +1765,10 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1778,11 +1778,11 @@ ; GFX8-NEXT: $vgpr2 = COPY [[COPY7]] ; GFX8-NEXT: $vgpr3 = COPY [[COPY8]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <4 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_offset ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1791,11 +1791,11 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1822,11 +1822,11 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1853,11 +1853,11 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1876,11 +1876,11 @@ ; GFX8-NEXT: $vgpr6 = COPY [[COPY11]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY12]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; GFX6-LABEL: name: s_buffer_load_v16f32_vgpr_offset ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1889,13 +1889,13 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1938,13 +1938,13 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1987,13 +1987,13 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2028,11 +2028,11 @@ ; GFX8-NEXT: $vgpr14 = COPY [[COPY19]] ; GFX8-NEXT: $vgpr15 = COPY [[COPY20]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 - %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -2041,10 +2041,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 @@ -2055,10 +2055,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 @@ -2069,18 +2069,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -2089,10 +2089,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 @@ -2103,10 +2103,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 @@ -2117,18 +2117,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -2137,10 +2137,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -2151,10 +2151,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -2165,19 +2165,19 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Make sure the base offset is added to each split load. -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -2186,11 +2186,11 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2217,11 +2217,11 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2248,11 +2248,11 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2272,12 +2272,12 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY12]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } ; Make sure the maximum offset isn't exeeded when splitting this -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -2286,11 +2286,11 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2317,11 +2317,11 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2348,11 +2348,11 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2372,11 +2372,11 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY12]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -2385,13 +2385,13 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2434,13 +2434,13 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2483,13 +2483,13 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2525,11 +2525,11 @@ ; GFX8-NEXT: $vgpr15 = COPY [[COPY20]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4032 - %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } -define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -2538,13 +2538,13 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2587,13 +2587,13 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2636,13 +2636,13 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2678,12 +2678,12 @@ ; GFX8-NEXT: $vgpr15 = COPY [[COPY20]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4036 - %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } ; Waterfall loop due to resource being VGPR -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(ptr addrspace(8) %rsrc, i32 inreg %soffset) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_rsrc ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -2693,8 +2693,8 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -2719,7 +2719,7 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -2740,8 +2740,8 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -2766,7 +2766,7 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -2787,8 +2787,8 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -2813,7 +2813,7 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -2825,12 +2825,12 @@ ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Use the offset inside the waterfall loop -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -2840,8 +2840,8 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -2864,7 +2864,7 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -2885,8 +2885,8 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -2909,7 +2909,7 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -2930,8 +2930,8 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: @@ -2954,7 +2954,7 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -2967,12 +2967,12 @@ ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Scalar offset exceeds MUBUF limit, keep add out of the loop -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -2982,10 +2982,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3010,7 +3010,7 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -3031,10 +3031,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3059,7 +3059,7 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -3080,10 +3080,10 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3108,7 +3108,7 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -3121,12 +3121,12 @@ ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Waterfall loop, but constant offset -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(ptr addrspace(8) %rsrc) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -3160,7 +3160,7 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc + 4095, align 1, basealign 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -3205,7 +3205,7 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc + 4095, align 1, basealign 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -3250,7 +3250,7 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc + 4095, align 1, basealign 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -3262,12 +3262,12 @@ ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4095, i32 0) ret float %val } ; Waterfall loop, but constant offset -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(ptr addrspace(8) %rsrc) { ; GFX6-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -3277,8 +3277,8 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3303,7 +3303,7 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -3324,8 +3324,8 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3350,7 +3350,7 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -3395,7 +3395,7 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc + 4096, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -3407,13 +3407,13 @@ ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4096, i32 0) ret float %val } ; Need a waterfall loop, but the offset is scalar. ; Make sure the base offset is added to each split load. -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -3423,8 +3423,8 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -3447,8 +3447,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -3485,8 +3485,8 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -3509,8 +3509,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -3547,8 +3547,8 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: @@ -3571,8 +3571,8 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -3601,13 +3601,13 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } ; Need a waterfall loop, but the offset is scalar. ; Make sure the maximum offset isn't exeeded when splitting this -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -3617,10 +3617,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3645,8 +3645,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -3683,10 +3683,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3711,8 +3711,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -3749,10 +3749,10 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3777,8 +3777,8 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -3807,11 +3807,11 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY17]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -3821,10 +3821,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3849,8 +3849,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -3887,10 +3887,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3915,8 +3915,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -3953,10 +3953,10 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -3981,8 +3981,8 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -4011,11 +4011,11 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY17]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4096 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(ptr addrspace(8) %rsrc, i32 %offset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -4025,9 +4025,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -4050,8 +4050,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -4088,9 +4088,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -4113,8 +4113,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -4151,8 +4151,8 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4064 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} @@ -4176,8 +4176,8 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 936, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 952, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 936, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 952, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -4206,11 +4206,11 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 5000 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(ptr addrspace(8) %rsrc, i32 %offset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -4220,9 +4220,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -4245,8 +4245,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -4283,9 +4283,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -4308,8 +4308,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -4346,8 +4346,8 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 12 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} @@ -4371,8 +4371,8 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -4401,11 +4401,11 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4076 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(ptr addrspace(8) %rsrc, i32 %offset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -4415,9 +4415,9 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -4440,8 +4440,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -4478,9 +4478,9 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -4503,8 +4503,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -4541,8 +4541,8 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} @@ -4566,8 +4566,8 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -4596,11 +4596,11 @@ ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4080 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(ptr addrspace(8) %rsrc, i32 %offset.base) { ; GFX6-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064 ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: successors: %bb.2(0x80000000) @@ -4634,8 +4634,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -4696,8 +4696,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -4758,8 +4758,8 @@ ; GFX8-NEXT: bb.3: ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} @@ -4787,11 +4787,11 @@ ; GFX8-NEXT: $vgpr6 = COPY [[COPY14]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY15]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 4064, i32 0) ret <8 x float> %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; GFX6-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -4800,10 +4800,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr @@ -4814,10 +4814,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr @@ -4828,18 +4828,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.v, %offset.s - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; GFX6-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -4848,10 +4848,10 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr @@ -4862,10 +4862,10 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr @@ -4876,18 +4876,18 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.s, %offset.v - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; GFX6-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -4896,13 +4896,13 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm @@ -4913,13 +4913,13 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm @@ -4930,22 +4930,22 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, %offset.s %offset = add i32 %offset.base, 1024 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; GFX6-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -4954,13 +4954,13 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm @@ -4971,13 +4971,13 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm @@ -4988,23 +4988,23 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, %offset.v %offset = add i32 %offset.base, 1024 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } ; TODO: Ideally this would be reassociated to fold. -define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; GFX6-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -5013,12 +5013,12 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -5029,12 +5029,12 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -5045,21 +5045,21 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 %offset = add i32 %offset.base, %offset.v - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; GFX6-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr ; GFX6: bb.1 (%ir-block.0): ; GFX6-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -5068,13 +5068,13 @@ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -5085,13 +5085,13 @@ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -5102,44 +5102,44 @@ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 %offset = add i32 %offset.base, %offset.s - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) -declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) -declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) -declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32 immarg) -declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32 immarg) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32 immarg) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8), i32, i32 immarg) +declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8), i32, i32 immarg) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32 immarg) +declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8), i32, i32 immarg) +declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8), i32, i32 immarg) -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) -declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32 immarg) -declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32 immarg) -declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg) -declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32 immarg) -declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32 immarg) +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32 immarg) +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8), i32, i32 immarg) +declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8), i32, i32 immarg) +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8), i32, i32 immarg) +declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8), i32, i32 immarg) +declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8), i32, i32 immarg) -declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32 immarg) -declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32 immarg) -declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32 immarg) +declare i96 @llvm.amdgcn.s.buffer.load.i96(ptr addrspace(8), i32, i32 immarg) +declare i256 @llvm.amdgcn.s.buffer.load.i256(ptr addrspace(8), i32, i32 immarg) +declare i512 @llvm.amdgcn.s.buffer.load.i512(ptr addrspace(8), i32, i32 immarg) -declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32 immarg) -declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32 immarg) +declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(ptr addrspace(8), i32, i32 immarg) +declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(ptr addrspace(8), i32, i32 immarg) -declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32 immarg) -declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32 immarg) +declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(ptr addrspace(8), i32, i32 immarg) +declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(ptr addrspace(8), i32, i32 immarg) -declare <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32>, i32, i32 immarg) -declare <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32>, i32, i32 immarg) +declare <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(ptr addrspace(8), i32, i32 immarg) +declare <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(ptr addrspace(8), i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -40,7 +40,7 @@ ret void } -define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { +define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, ptr addrspace(8) inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -75,7 +75,7 @@ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: .LBB2_4: ; %.exit ; GCN-NEXT: s_endpgm - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 br i1 %cmp, label %.zero, label %.one @@ -95,6 +95,6 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32) attributes #0 = { convergent readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -12,20 +12,20 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } -define amdgpu_ps float @struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -35,20 +35,20 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } -define amdgpu_ps <2 x float> @struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -60,23 +60,23 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN]].sub0 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY9]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY10]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to <2 x float> ret <2 x float> %cast } -define amdgpu_ps void @struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -88,19 +88,19 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_ATOMIC_ADD_X2_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_ATOMIC_ADD_X2_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; All register operands need legalization -define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -111,10 +111,10 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -144,7 +144,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -156,13 +156,13 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } ; All register operands need legalization -define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -173,10 +173,10 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -206,7 +206,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -217,12 +217,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; Natural mapping + slc -define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -232,20 +232,20 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) %cast = bitcast i32 %ret to float ret float %cast } -declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll @@ -3,7 +3,7 @@ ; Natural mapping -define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -14,23 +14,23 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN]].sub0 ; CHECK-NEXT: $vgpr0 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } ; Natural mapping -define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -41,20 +41,20 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; All operands need legalization -define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -66,10 +66,10 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -101,7 +101,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN]].sub0 ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec @@ -114,13 +114,13 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[COPY17]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } ; All operands need legalization -define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, i32 inreg %cmp, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -132,10 +132,10 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -167,7 +167,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -178,11 +178,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -193,23 +193,23 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN]].sub0 ; CHECK-NEXT: $vgpr0 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i32 %ret to float ret float %cast } -declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64, i64, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64, i64, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll @@ -1,26 +1,26 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s ; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr(<4 x s32>), %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32), align 1, addrspace 8) (in function: buffer_atomic_add_f32_rtn) +; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %{{[0-9]+}}:vgpr, %{{[0-9]+|rsrcVec}}:sgpr(<4 x s32>), %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32) on %ir.rsrc.load, align 1, addrspace 8) (in function: buffer_atomic_add_f32_rtn) -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) -declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{[0-9]+}} idxen offen glc -define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { +define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { main_body: - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) store float %ret, ptr undef ret void } ; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn: ; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{[0-9]+}} idxen offen glc -define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { +define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { main_body: - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) store <2 x half> %ret, ptr undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A ; Natural mapping -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -13,12 +13,12 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -29,18 +29,18 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -50,12 +50,12 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -66,19 +66,19 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -88,10 +88,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -102,17 +102,17 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) ret void } ; Natural mapping, no voffset -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -122,10 +122,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -136,17 +136,17 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } ; All register operands need legalization -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -157,10 +157,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -190,7 +190,7 @@ ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} @@ -211,10 +211,10 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -244,7 +244,7 @@ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -255,12 +255,12 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; All register operands need legalization, no voffset -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -271,9 +271,9 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -301,7 +301,7 @@ ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} @@ -322,9 +322,9 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -352,7 +352,7 @@ ; GFX90A-NEXT: bb.3: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -363,12 +363,12 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } ; Natural mapping + slc -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -378,12 +378,12 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX90A: bb.1 (%ir-block.0): @@ -394,18 +394,18 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -415,10 +415,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc ; GFX90A: bb.1 (%ir-block.0): @@ -429,16 +429,16 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -448,12 +448,12 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -464,18 +464,18 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -485,10 +485,10 @@ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -499,16 +499,16 @@ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX90A-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s -define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -11,12 +11,12 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset @@ -27,19 +27,19 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret half %val } -define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -48,12 +48,12 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -75,25 +75,25 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %val } ; FIXME: Crashes -; define amdgpu_ps <3 x half> @struct_buffer_load_format_v3f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { -; %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps <3 x half> @struct_buffer_load_format_v3f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ; ret <3 x half> %val ; } -define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -102,12 +102,12 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 @@ -139,23 +139,23 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 ; PACKED-NEXT: $vgpr0 = COPY [[COPY7]] ; PACKED-NEXT: $vgpr1 = COPY [[COPY8]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <4 x half> %val } ; Waterfall for rsrc and soffset, copy for voffset -define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; UNPACKED-LABEL: name: struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -165,10 +165,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -197,7 +197,7 @@ ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -239,10 +239,10 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -271,7 +271,7 @@ ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -286,11 +286,11 @@ ; PACKED-NEXT: $vgpr0 = COPY [[COPY13]] ; PACKED-NEXT: $vgpr1 = COPY [[COPY14]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <4 x half> %val } -define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -299,12 +299,12 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095 @@ -315,20 +315,20 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret half %val } -define amdgpu_ps half @struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -337,12 +337,12 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset @@ -353,23 +353,23 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %fval = bitcast i16 %val to half ret half %fval } -declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare half @llvm.amdgcn.struct.buffer.load.format.f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare i16 @llvm.amdgcn.struct.buffer.load.format.i16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -10,19 +10,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_format_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -31,22 +31,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY7]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x float> %val } -define amdgpu_ps <3 x float> @struct_buffer_load_format_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <3 x float> @struct_buffer_load_format_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_format_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -55,12 +55,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub2 @@ -68,11 +68,11 @@ ; CHECK-NEXT: $vgpr1 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr2 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <3 x float> %val } -define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_format_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -81,12 +81,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2 @@ -96,12 +96,12 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY9]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY10]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <4 x float> %val } ; Waterfall for rsrc and soffset, copy for voffset -define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -111,10 +111,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -143,7 +143,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -162,11 +162,11 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY15]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY16]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <4 x float> %val } -define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -175,20 +175,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @struct_buffer_load_format_i32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_format_i32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_format_i32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -197,20 +197,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %fval = bitcast i32 %val to float ret float %fval } -define amdgpu_cs void @struct_buffer_load_format_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %value, ptr addrspace(1) %status) { +define amdgpu_cs void @struct_buffer_load_format_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %value, ptr addrspace(1) %status) { ; CHECK-LABEL: name: struct_buffer_load_format_v4i32_tfe ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -219,26 +219,26 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub0 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub1 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub2 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub3 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub4 ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3 - ; CHECK-NEXT: FLAT_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into %ir.value, addrspace 1) - ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORDX4 [[REG_SEQUENCE]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into %ir.value, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY13]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 - %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %v = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> %v, ptr addrspace(1) %value @@ -249,7 +249,7 @@ ret void } -define amdgpu_cs void @struct_buffer_load_format_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %value, ptr addrspace(1) %status) { +define amdgpu_cs void @struct_buffer_load_format_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %value, ptr addrspace(1) %status) { ; CHECK-LABEL: name: struct_buffer_load_format_v3i32_tfe ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -258,25 +258,25 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub0 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub1 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub2 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub3 ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2 - ; CHECK-NEXT: FLAT_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>) into %ir.value, align 16, addrspace 1) - ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORDX3 [[REG_SEQUENCE]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>) into %ir.value, align 16, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 - %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %v = extractvalue { <3 x i32>, i32 } %load, 0 store <3 x i32> %v, ptr addrspace(1) %value @@ -287,7 +287,7 @@ ret void } -define amdgpu_cs void @struct_buffer_load_format_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %value, ptr addrspace(1) %status) { +define amdgpu_cs void @struct_buffer_load_format_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %value, ptr addrspace(1) %status) { ; CHECK-LABEL: name: struct_buffer_load_format_i32_tfe ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -296,22 +296,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub0 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub1 - ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1) - ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 - %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %v = extractvalue { i32, i32 } %load, 0 store i32 %v, ptr addrspace(1) %value @@ -322,13 +322,13 @@ ret void } -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -11,20 +11,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Natural mapping -define amdgpu_ps <2 x float> @struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -33,23 +33,23 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY7]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x float> %val } ; Natural mapping -define amdgpu_ps <3 x float> @struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <3 x float> @struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -58,12 +58,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub2 @@ -71,12 +71,12 @@ ; CHECK-NEXT: $vgpr1 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr2 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <3 x float> %val } ; Natural mapping -define amdgpu_ps <4 x float> @struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x float> @struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -85,12 +85,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub2 @@ -100,12 +100,12 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY9]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY10]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <4 x float> %val } ; Natural mapping -define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -114,21 +114,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Natural mapping -define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -137,20 +137,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset) { +define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset) { ; CHECK-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1 @@ -159,20 +159,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 64, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 64, i32 0) ret float %val } ; Need to legalize all reg operands -define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -182,10 +182,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -214,7 +214,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -226,11 +226,11 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -239,21 +239,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %ext = zext i8 %val to i32 %cast = bitcast i32 %ext to float ret float %cast } -define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -262,21 +262,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %ext = sext i8 %val to i32 %cast = bitcast i32 %ext to float ret float %cast } -define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -285,21 +285,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %ext = zext i16 %val to i32 %cast = bitcast i32 %ext to float ret float %cast } -define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -308,22 +308,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_SSHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_SSHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %ext = sext i16 %val to i32 %cast = bitcast i32 %ext to float ret float %cast } ; Natural mapping -define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -332,20 +332,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret half %val } ; Natural mapping -define amdgpu_ps <2 x half> @struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -354,26 +354,26 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %val } ; FIXME: Crashes -; define amdgpu_ps <3 x half> @struct_buffer_load_v3f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { -; %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps <3 x half> @struct_buffer_load_v3f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; %val = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ; ret <3 x half> %val ; } ; Natural mapping -define amdgpu_ps <4 x half> @struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x half> @struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -382,23 +382,23 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY7]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <4 x half> %val } ; Natural mapping + glc -define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -407,27 +407,27 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1) ret float %val } -declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s -define amdgpu_ps void @struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(half %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -12,12 +12,12 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -28,18 +28,18 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -49,16 +49,16 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -69,24 +69,24 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; FIXME: -; define amdgpu_ps void @struct_buffer_store_format_v3f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { -; call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps void @struct_buffer_store_format_v3f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ; ret void ; } -define amdgpu_ps void @struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -97,7 +97,6 @@ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 @@ -106,9 +105,10 @@ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY]], implicit $exec ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -121,18 +121,18 @@ ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(half inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(half inreg %val, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; UNPACKED-LABEL: name: struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: successors: %bb.2(0x80000000) @@ -143,10 +143,10 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -176,7 +176,7 @@ ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -197,10 +197,10 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -230,7 +230,7 @@ ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -241,11 +241,11 @@ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(i16 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(i16 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -255,12 +255,12 @@ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -271,19 +271,19 @@ ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.i16(i16, <4 x i32>, i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.i16(i16, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -o - %s | FileCheck %s -define amdgpu_ps void @struct_buffer_store_format_f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_format_f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -11,18 +11,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_v2f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x float> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_v2f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_format_v2f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -34,18 +34,18 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_v3f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x float> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_v3f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_format_v3f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -58,18 +58,18 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZ_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZ_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_v4f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x float> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_v4f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_format_v4f32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -83,18 +83,18 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(float inreg %val, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -105,10 +105,10 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] @@ -138,7 +138,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -149,11 +149,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_format_i32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_format_i32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_format_i32__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -163,19 +163,19 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.format.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.format.i32(i32, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -12,18 +12,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x float> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -35,18 +35,18 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x float> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -59,18 +59,18 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x float> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x float> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -84,18 +84,18 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x float> inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x float> inreg %val, ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -110,10 +110,10 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] @@ -143,7 +143,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -154,11 +154,11 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -168,19 +168,19 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 - call void @llvm.amdgcn.struct.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.i8(i8 %val.trunc, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -190,19 +190,19 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 - call void @llvm.amdgcn.struct.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.i16(i16 %val.trunc, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -212,18 +212,18 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1) + call void @llvm.amdgcn.struct.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1) ret void } -define amdgpu_ps void @struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -233,24 +233,24 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; FIXME: -; define amdgpu_ps void @struct_buffer_store_v3f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { -; call void @llvm.amdgcn.struct.buffer.store.v3f16(<3 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) +; define amdgpu_ps void @struct_buffer_store_v3f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<3 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; call void @llvm.amdgcn.struct.buffer.store.v3f16(<3 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ; ret void ; } -define amdgpu_ps void @struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -262,26 +262,26 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.i8(i8, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.i16(i16, ptr addrspace(8), i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.f16(half, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.f16(half, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32 immarg) -declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -4,7 +4,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=PACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=UNPACKED %s -define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -13,12 +13,12 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset @@ -29,19 +29,19 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } -define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -50,12 +50,12 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset @@ -66,12 +66,12 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -85,17 +85,17 @@ ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x half> %val } ; FIXME: Crashes -; define amdgpu_ps <3 x half> @struct_tbuffer_load_v3f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { -; %val = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) +; define amdgpu_ps <3 x half> @struct_tbuffer_load_v3f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; %val = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ; ret <3 x half> %val ; } -define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -104,12 +104,12 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 ; PACKED-NEXT: $vgpr0 = COPY [[COPY7]] @@ -123,12 +123,12 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 @@ -152,11 +152,11 @@ ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x half> %val } -define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -165,13 +165,13 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 @@ -182,20 +182,20 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } -define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: successors: %bb.2(0x80000000) @@ -205,10 +205,10 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -237,7 +237,7 @@ ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} @@ -261,10 +261,10 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -293,7 +293,7 @@ ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -326,11 +326,11 @@ ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x half> %val } -define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 ; PACKED: bb.1 (%ir-block.0): ; PACKED-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -339,12 +339,12 @@ ; PACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 @@ -355,22 +355,22 @@ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) + ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } -declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 -declare <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 -declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 +declare half @llvm.amdgcn.struct.tbuffer.load.f16(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 +declare <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 +declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s -define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -12,19 +12,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret float %val } -define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -33,22 +33,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 ; CHECK-NEXT: $vgpr0 = COPY [[COPY7]] ; CHECK-NEXT: $vgpr1 = COPY [[COPY8]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x float> %val } -define amdgpu_ps <3 x float> @struct_tbuffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <3 x float> @struct_tbuffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -57,12 +57,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub2 @@ -70,11 +70,11 @@ ; CHECK-NEXT: $vgpr1 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr2 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <3 x float> %val } -define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -83,12 +83,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2 @@ -98,11 +98,11 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY9]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY10]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x float> %val } -define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -111,20 +111,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) ret float %val } -define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { +define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -134,10 +134,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -166,7 +166,7 @@ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -185,11 +185,11 @@ ; CHECK-NEXT: $vgpr2 = COPY [[COPY15]] ; CHECK-NEXT: $vgpr3 = COPY [[COPY16]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x float> %val } -define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { +define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -198,22 +198,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 - %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret float %val } -declare float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 -declare <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 -declare <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 +declare float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 +declare <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 +declare <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s -define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspace(6) inreg %arg3) { +define amdgpu_cs void @test1(i32 %arg1, ptr addrspace(8) inreg %arg2, i32, ptr addrspace(6) inreg %arg3) { ; CHECK-LABEL: test1: ; CHECK: ; %bb.0: ; %.entry ; CHECK-NEXT: v_and_b32_e32 v3, 0x3ffffffc, v0 @@ -17,27 +17,27 @@ %bs1 = and i32 %arg1, 1073741820 %ep1 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs1 %ad1 = ptrtoint ptr addrspace(6) %ep1 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, <4 x i32> %arg2, i32 %ad1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, ptr addrspace(8) %arg2, i32 %ad1, i32 0, i32 0) %bs2 = or i32 %bs1, 1 %ep2 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs2 %ad2 = ptrtoint ptr addrspace(6) %ep2 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, <4 x i32> %arg2, i32 %ad2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, ptr addrspace(8) %arg2, i32 %ad2, i32 0, i32 0) %bs3 = or i32 %bs1, 2 %ep3 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs3 %ad3 = ptrtoint ptr addrspace(6) %ep3 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, <4 x i32> %arg2, i32 %ad3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, ptr addrspace(8) %arg2, i32 %ad3, i32 0, i32 0) %bs4 = or i32 %bs1, 3 %ep4 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs4 %ad4 = ptrtoint ptr addrspace(6) %ep4 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, <4 x i32> %arg2, i32 %ad4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, ptr addrspace(8) %arg2, i32 %ad4, i32 0, i32 0) ret void } -define amdgpu_cs void @test2(i32 %arg1, <4 x i32> inreg %arg2) { +define amdgpu_cs void @test2(i32 %arg1, ptr addrspace(8) inreg %arg2) { ; CHECK-LABEL: test2: ; CHECK: ; %bb.0: ; %.entry ; CHECK-NEXT: v_and_b32_e32 v3, 0x3ffffffc, v0 @@ -52,24 +52,24 @@ %bs1 = and i32 %arg1, 1073741820 %ep1 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs1 %ad1 = ptrtoint ptr addrspace(6) %ep1 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, <4 x i32> %arg2, i32 %ad1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, ptr addrspace(8) %arg2, i32 %ad1, i32 0, i32 0) %bs2 = or i32 %bs1, 1 %ep2 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs2 %ad2 = ptrtoint ptr addrspace(6) %ep2 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, <4 x i32> %arg2, i32 %ad2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, ptr addrspace(8) %arg2, i32 %ad2, i32 0, i32 0) %bs3 = or i32 %bs1, 2 %ep3 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs3 %ad3 = ptrtoint ptr addrspace(6) %ep3 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, <4 x i32> %arg2, i32 %ad3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, ptr addrspace(8) %arg2, i32 %ad3, i32 0, i32 0) %bs4 = or i32 %bs1, 3 %ep4 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs4 %ad4 = ptrtoint ptr addrspace(6) %ep4 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, <4 x i32> %arg2, i32 %ad4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, ptr addrspace(8) %arg2, i32 %ad4, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -18,7 +18,7 @@ ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[COPY1]](s32), 0 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 :: (dereferenceable invariant load (s128), align 16) ... @@ -40,7 +40,7 @@ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 :: (dereferenceable invariant load (s128), align 16) ... @@ -97,7 +97,7 @@ ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr0 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 :: (dereferenceable invariant load (s128), align 16) ... @@ -153,6 +153,6 @@ ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 :: (dereferenceable invariant load (s128), align 16) ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -12,20 +12,20 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Copies for VGPR arguments -define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7 @@ -34,21 +34,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY7]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY7]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for rsrc -define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -58,9 +58,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF @@ -69,7 +69,7 @@ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -87,7 +87,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -99,12 +99,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for soffset -define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -114,9 +114,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF @@ -125,7 +125,7 @@ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY5]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) @@ -134,7 +134,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -146,12 +146,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for rsrc and soffset -define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -161,9 +161,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF @@ -172,7 +172,7 @@ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -193,7 +193,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -205,8 +205,8 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %val } -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -3,7 +3,7 @@ ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s --check-prefix=GREEDY ; Natural mapping -define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps i32 @s_buffer_load_i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: s_buffer_load_i32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -12,9 +12,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32) @@ -27,18 +27,18 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, align 1, addrspace 8) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; GREEDY-NEXT: $sgpr0 = COPY [[INT]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret i32 %val } -define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: s_buffer_load_v2i32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -47,9 +47,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64), align 4) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -66,9 +66,9 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64), align 4) + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<2 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -77,11 +77,11 @@ ; GREEDY-NEXT: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) ; GREEDY-NEXT: $sgpr1 = COPY [[INT1]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <2 x i32> %val } -define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: s_buffer_load_v3i32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -90,9 +90,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -112,9 +112,9 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -126,11 +126,11 @@ ; GREEDY-NEXT: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) ; GREEDY-NEXT: $sgpr2 = COPY [[INT2]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 - %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <3 x i32> %val } -define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: s_buffer_load_v8i32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -139,9 +139,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256), align 4) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -176,9 +176,9 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256), align 4) + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -205,11 +205,11 @@ ; GREEDY-NEXT: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) ; GREEDY-NEXT: $sgpr7 = COPY [[INT7]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 - %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x i32> %val } -define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: s_buffer_load_v16i32 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 @@ -218,9 +218,9 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512), align 4) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -279,9 +279,9 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512), align 4) + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -332,12 +332,12 @@ ; GREEDY-NEXT: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32) ; GREEDY-NEXT: $sgpr15 = COPY [[INT15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 - %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x i32> %val } ; Check cases that need to be converted to MUBUF due to the offset being a VGPR. -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -346,11 +346,11 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset @@ -361,18 +361,18 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v2f32_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -381,11 +381,11 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -398,20 +398,20 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) ; GREEDY-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <2 x float> %val } -define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -420,11 +420,11 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -438,21 +438,21 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) ; GREEDY-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GREEDY-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <3 x float> %val } -define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v4f32_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -461,11 +461,11 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -480,22 +480,22 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) ; GREEDY-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GREEDY-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GREEDY-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <4 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -504,12 +504,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -529,12 +529,12 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -546,11 +546,11 @@ ; GREEDY-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GREEDY-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -559,14 +559,14 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -594,14 +594,14 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -621,11 +621,11 @@ ; GREEDY-NEXT: $vgpr14 = COPY [[UV14]](s32) ; GREEDY-NEXT: $vgpr15 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 - %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } -define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_i96_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -634,12 +634,12 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128) ; CHECK-NEXT: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 @@ -651,22 +651,22 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128) ; GREEDY-NEXT: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call i96 @llvm.amdgcn.s.buffer.load.i96(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store i96 %val, ptr addrspace(1) undef ret void } ; Test split of a wide scalar -define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_i256_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -675,13 +675,13 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256) ; CHECK-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1) @@ -697,13 +697,13 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256) ; GREEDY-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1) @@ -711,13 +711,13 @@ ; GREEDY-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GREEDY-NEXT: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call i256 @llvm.amdgcn.s.buffer.load.i256(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store i256 %val, ptr addrspace(1) undef ret void } ; Test split of a wide scalar -define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_i512_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -726,15 +726,15 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512) ; CHECK-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1) @@ -756,15 +756,15 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GREEDY-NEXT: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512) ; GREEDY-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1) @@ -778,13 +778,13 @@ ; GREEDY-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GREEDY-NEXT: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call i512 @llvm.amdgcn.s.buffer.load.i512(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store i512 %val, ptr addrspace(1) undef ret void } ; Test split of a vector with 16-bit elements -define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v16i16_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -793,13 +793,13 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) ; CHECK-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 32, addrspace 1) @@ -815,13 +815,13 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) ; GREEDY-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 32, addrspace 1) @@ -829,13 +829,13 @@ ; GREEDY-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GREEDY-NEXT: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store <16 x i16> %val, ptr addrspace(1) undef ret void } ; Test split of a vector with 16-bit elements -define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v32i16_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -844,15 +844,15 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) ; CHECK-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 64, addrspace 1) @@ -874,15 +874,15 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) ; GREEDY-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 64, addrspace 1) @@ -896,13 +896,13 @@ ; GREEDY-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GREEDY-NEXT: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store <32 x i16> %val, ptr addrspace(1) undef ret void } ; Test split of a vector with 64-bit elements -define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v4i64_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -911,13 +911,13 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) ; CHECK-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 32, addrspace 1) @@ -933,13 +933,13 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) ; GREEDY-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 32, addrspace 1) @@ -947,13 +947,13 @@ ; GREEDY-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GREEDY-NEXT: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store <4 x i64> %val, ptr addrspace(1) undef ret void } ; Test split of a vector with 64-bit elements -define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v8i64_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -962,15 +962,15 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) ; CHECK-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 64, addrspace 1) @@ -992,15 +992,15 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) ; GREEDY-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 64, addrspace 1) @@ -1014,13 +1014,13 @@ ; GREEDY-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GREEDY-NEXT: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store <8 x i64> %val, ptr addrspace(1) undef ret void } ; Test split of a vector with 64-bit pointer elements -define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v4p1_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1029,13 +1029,13 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) ; CHECK-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 32, addrspace 1) @@ -1051,13 +1051,13 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) ; GREEDY-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 32, addrspace 1) @@ -1065,13 +1065,13 @@ ; GREEDY-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GREEDY-NEXT: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store <4 x ptr addrspace(1)> %val, ptr addrspace(1) undef ret void } ; Test split of a vector with 64-bit pointer elements -define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { +define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(ptr addrspace(8) inreg %rsrc, i32 %soffset) { ; CHECK-LABEL: name: s_buffer_load_v8p1_vgpr_offset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1080,15 +1080,15 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) ; CHECK-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 64, addrspace 1) @@ -1110,15 +1110,15 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) ; GREEDY-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 64, addrspace 1) @@ -1132,12 +1132,12 @@ ; GREEDY-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GREEDY-NEXT: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 - %val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) store <8 x ptr addrspace(1)> %val, ptr addrspace(1) undef ret void } -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1146,14 +1146,14 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 @@ -1164,22 +1164,22 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1188,14 +1188,14 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 @@ -1206,22 +1206,22 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1230,13 +1230,13 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -1247,22 +1247,22 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Make sure the base offset is added to each split load. -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1271,15 +1271,15 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1299,15 +1299,15 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1320,12 +1320,12 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } ; Make sure the maximum offset isn't exeeded when splitting this -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1334,14 +1334,14 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1361,14 +1361,14 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1381,11 +1381,11 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1394,17 +1394,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1432,17 +1432,17 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1463,11 +1463,11 @@ ; GREEDY-NEXT: $vgpr15 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4032 - %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } -define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i32> inreg %rsrc, i32 %soffset.base) { +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(ptr addrspace(8) inreg %rsrc, i32 %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 @@ -1476,16 +1476,16 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1513,16 +1513,16 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 16, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 48, align 4, addrspace 8) ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1543,12 +1543,12 @@ ; GREEDY-NEXT: $vgpr15 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4036 - %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } ; Waterfall loop due to resource being VGPR -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(ptr addrspace(8) %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1557,8 +1557,8 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -1566,7 +1566,7 @@ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1584,7 +1584,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -1602,8 +1602,8 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -1611,7 +1611,7 @@ ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1629,7 +1629,7 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -1639,12 +1639,12 @@ ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Use the offset inside the waterfall loop -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1653,17 +1653,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %21, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1681,7 +1681,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -1699,17 +1699,17 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %21, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1727,7 +1727,7 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -1738,12 +1738,12 @@ ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Scalar offset exceeds MUBUF limit, keep add out of the loop -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1752,10 +1752,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -1763,7 +1763,7 @@ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %22, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1781,7 +1781,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -1799,10 +1799,10 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -1810,7 +1810,7 @@ ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %22, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1828,7 +1828,7 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -1839,12 +1839,12 @@ ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret float %val } ; Waterfall loop, but constant offset -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(ptr addrspace(8) %rsrc) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1853,8 +1853,8 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -1862,7 +1862,7 @@ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1880,7 +1880,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc + 4095, align 1, basealign 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -1898,8 +1898,8 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -1907,7 +1907,7 @@ ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1925,7 +1925,7 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc + 4095, align 1, basealign 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -1935,12 +1935,12 @@ ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4095, i32 0) ret float %val } ; Waterfall loop, but constant offset -define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) { +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(ptr addrspace(8) %rsrc) { ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1949,8 +1949,8 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -1958,7 +1958,7 @@ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -1976,7 +1976,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -1994,8 +1994,8 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -2003,7 +2003,7 @@ ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2021,7 +2021,7 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2031,13 +2031,13 @@ ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4096, i32 0) ret float %val } ; Need a waterfall loop, but the offset is scalar. ; Make sure the base offset is added to each split load. -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -2046,17 +2046,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2074,8 +2074,8 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -2102,17 +2102,17 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2130,8 +2130,8 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2151,13 +2151,13 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } ; Need a waterfall loop, but the offset is scalar. ; Make sure the maximum offset isn't exeeded when splitting this -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -2166,10 +2166,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -2177,7 +2177,7 @@ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %32, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2195,8 +2195,8 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -2223,10 +2223,10 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -2234,7 +2234,7 @@ ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %32, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2252,8 +2252,8 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2273,11 +2273,11 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(ptr addrspace(8) %rsrc, i32 inreg %soffset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -2286,10 +2286,10 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -2297,7 +2297,7 @@ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %32, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2315,8 +2315,8 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -2343,10 +2343,10 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -2354,7 +2354,7 @@ ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %32, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2372,8 +2372,8 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2393,11 +2393,11 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4096 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(ptr addrspace(8) %rsrc, i32 %offset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -2406,17 +2406,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2434,8 +2434,8 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -2462,17 +2462,17 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2490,8 +2490,8 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2511,11 +2511,11 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 5000 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(ptr addrspace(8) %rsrc, i32 %offset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -2524,17 +2524,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2552,8 +2552,8 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -2580,17 +2580,17 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2608,8 +2608,8 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2629,11 +2629,11 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4076 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(ptr addrspace(8) %rsrc, i32 %offset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -2642,17 +2642,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2670,8 +2670,8 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -2698,17 +2698,17 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2726,8 +2726,8 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc, align 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2747,11 +2747,11 @@ ; GREEDY-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4080 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } -define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(<4 x i32> %rsrc, i32 %offset.base) { +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(ptr addrspace(8) %rsrc, i32 %offset.base) { ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064 ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -2760,8 +2760,8 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -2769,7 +2769,7 @@ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2787,8 +2787,8 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4, %bb.2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -2815,8 +2815,8 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 @@ -2824,7 +2824,7 @@ ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %31, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -2842,8 +2842,8 @@ ; GREEDY-NEXT: bb.3: ; GREEDY-NEXT: successors: %bb.4, %bb.2 ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from %ir.rsrc + 4064, align 4, addrspace 8) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} @@ -2862,11 +2862,11 @@ ; GREEDY-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GREEDY-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0) + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 4064, i32 0) ret <8 x float> %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -2875,13 +2875,13 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr @@ -2892,21 +2892,21 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.v, %offset.s - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -2915,13 +2915,13 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr @@ -2932,21 +2932,21 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.s, %offset.v - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -2955,7 +2955,6 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) @@ -2963,9 +2962,10 @@ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm @@ -2976,7 +2976,6 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) @@ -2984,18 +2983,19 @@ ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, %offset.s %offset = add i32 %offset.base, 1024 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -3004,7 +3004,6 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) @@ -3012,9 +3011,10 @@ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm @@ -3025,7 +3025,6 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) @@ -3033,19 +3032,20 @@ ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, %offset.v %offset = add i32 %offset.base, 1024 - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } ; TODO: Ideally this would be reassociated to fold. -define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -3054,15 +3054,15 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]] ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -3073,24 +3073,24 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]] ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 %offset = add i32 %offset.base, %offset.v - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { +define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(ptr addrspace(8) inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -3099,7 +3099,6 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 @@ -3107,8 +3106,9 @@ ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -3119,7 +3119,6 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 @@ -3127,39 +3126,40 @@ ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32) from %ir.rsrc, addrspace 8) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 %offset = add i32 %offset.base, %offset.s - %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + %val = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) ret float %val } -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) -declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) -declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) -declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32 immarg) -declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32 immarg) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32 immarg) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8), i32, i32 immarg) +declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8), i32, i32 immarg) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32 immarg) +declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8), i32, i32 immarg) +declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8), i32, i32 immarg) -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) -declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32 immarg) -declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32 immarg) -declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg) -declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32 immarg) -declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32 immarg) +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32 immarg) +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8), i32, i32 immarg) +declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8), i32, i32 immarg) +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8), i32, i32 immarg) +declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8), i32, i32 immarg) +declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8), i32, i32 immarg) -declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32 immarg) -declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32 immarg) -declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32 immarg) +declare i96 @llvm.amdgcn.s.buffer.load.i96(ptr addrspace(8), i32, i32 immarg) +declare i256 @llvm.amdgcn.s.buffer.load.i256(ptr addrspace(8), i32, i32 immarg) +declare i512 @llvm.amdgcn.s.buffer.load.i512(ptr addrspace(8), i32, i32 immarg) -declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32 immarg) -declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32 immarg) +declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(ptr addrspace(8), i32, i32 immarg) +declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(ptr addrspace(8), i32, i32 immarg) -declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32 immarg) -declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32 immarg) +declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(ptr addrspace(8), i32, i32 immarg) +declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(ptr addrspace(8), i32, i32 immarg) -declare <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32>, i32, i32 immarg) -declare <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32>, i32, i32 immarg) +declare <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(ptr addrspace(8), i32, i32 immarg) +declare <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(ptr addrspace(8), i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir @@ -44,7 +44,7 @@ %1:_(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CONSTANT i32 256 %3:_(s32) = G_ADD %1, %2 - %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0 + %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0 :: (dereferenceable invariant load (s32)) S_ENDPGM 0, implicit %4 ... @@ -85,7 +85,7 @@ %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_CONSTANT i32 -60 %3:_(s32) = G_ADD %1, %2 - %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0 + %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0 :: (dereferenceable invariant load (s32)) S_ENDPGM 0, implicit %4 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -12,19 +12,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Copies for VGPR arguments -define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8 @@ -33,21 +33,21 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for rsrc -define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -57,17 +57,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -85,7 +85,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -97,12 +97,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for soffset -define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgpr_voffset__vgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { +define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgpr_voffset__vgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -112,17 +112,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY6]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) @@ -131,7 +131,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -143,12 +143,12 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } ; Waterfall for rsrc and soffset -define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { +define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -158,17 +158,17 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr6 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -189,7 +189,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -201,8 +201,8 @@ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %val } -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s ; Natural mapping -define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, float %val, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 @@ -12,19 +12,19 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; Copies for VGPR arguments -define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float inreg %val, i32 inreg %vindex, i32 inreg %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, float inreg %val, i32 inreg %vindex, i32 inreg %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9 @@ -33,22 +33,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32) - ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; Waterfall for rsrc -define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, float %val, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, float %val, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -58,18 +58,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -87,7 +87,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -98,12 +98,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; Waterfall for soffset -define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %vindex, i32 %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset(ptr addrspace(8) inreg %rsrc, float %val, i32 %vindex, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -113,18 +113,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY7]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) @@ -133,7 +133,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -144,12 +144,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; Waterfall for rsrc and soffset -define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, float %val, i32 %vindex, i32 %voffset, i32 %soffset) { +define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset(ptr addrspace(8) %rsrc, float %val, i32 %vindex, i32 %voffset, i32 %soffset) { ; CHECK-LABEL: name: struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__vgpr_soffset ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -159,18 +159,18 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr7 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -191,7 +191,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -202,8 +202,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll @@ -0,0 +1,9 @@ +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s + +; GISEL-ERR: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_LOAD %{{[0-9]+}}:vgpr(p8) :: (load (s32) from %ir.rsrc, addrspace 8) + +define float @load_from_rsrc(ptr addrspace(8) %rsrc) { +body: + %res = load float, ptr addrspace(8) %rsrc + ret float %res +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-ptr-add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-ptr-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-ptr-add.ll @@ -0,0 +1,14 @@ +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s + +; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(p8) = G_PTR_ADD %{{[0-9]+}}:_, %{{[0-9]+}}:_(s128) + + +define float @gep_on_rsrc(ptr addrspace(8) %rsrc) { +body: + %next = getelementptr float, ptr addrspace(8) %rsrc, i128 1 + %res = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %next, i32 0, i32 0, i32 0) + ret float %res +} + +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg) + diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll --- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: name: test_load_zext ; GCN: %[[OFFSET:[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @DescriptorBuffer ; SDAG: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR killed %{{[0-9]+}}, killed %[[OFFSET]], 0 :: (invariant load (s128) from %ir.12, addrspace 4) -; GISEL: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR %{{[0-9]+}}, %[[OFFSET]], 0 :: (invariant load (<4 x s32>) from {{.*}}, addrspace 4) +; GISEL: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR %{{[0-9]+}}, %[[OFFSET]], 0 :: (invariant load (p8) from {{.*}}, addrspace 4) define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %resNode0, i32 inreg %resNode1, <3 x i32> inreg %2, i32 inreg %3, <3 x i32> %4) local_unnamed_addr #2 { .entry: %5 = call i64 @llvm.amdgcn.s.getpc() #3 @@ -20,9 +20,9 @@ %10 = call i32 @llvm.amdgcn.reloc.constant(metadata !4) %11 = zext i32 %10 to i64 %12 = getelementptr [4294967295 x i8], ptr addrspace(4) %9, i64 0, i64 %11 - %13 = load <4 x i32>, ptr addrspace(4) %12, align 16, !invariant.load !5 - %14 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %13, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %14, <4 x i32> %13, i32 0, i32 0, i32 0) + %13 = load ptr addrspace(8), ptr addrspace(4) %12, align 16, !invariant.load !5 + %14 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %13, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %14, ptr addrspace(8) %13, i32 0, i32 0, i32 0) ret void } @@ -86,12 +86,18 @@ } ; GCN-LABEL: name: test_buffer_load_sgpr_plus_imm_offset -; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0 -; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1 -; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2 -; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3 +; SDAG-DAG: %[[PTR0:.*]]:sgpr_32 = COPY $sgpr0 +; SDAG-DAG: %[[PTR1:.*]]:sgpr_32 = COPY $sgpr1 +; SDAG-DAG: %[[PTR2:.*]]:sgpr_32 = COPY $sgpr2 +; SDAG-DAG: %[[PTR3:.*]]:sgpr_32 = COPY $sgpr3 ; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr4 -; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3 +; SDAG-DAG: %[[BASELO:.*]]:sgpr_64 = REG_SEQUENCE %[[PTR0]], %subreg.sub0, %[[PTR1]], %subreg.sub1 +; SDAG-DAG: %[[BASE0:.*]]:sreg_32 = COPY %[[BASELO]].sub0 +; SDAG-DAG: %[[BASE1:.*]]:sreg_32 = COPY %[[BASELO]].sub1 +; SDAG-DAG: %[[BASEHI:.*]]:sgpr_64 = REG_SEQUENCE %[[PTR2]], %subreg.sub0, %[[PTR3]], %subreg.sub1 +; SDAG-DAG: %[[BASE2:.*]]:sreg_32 = COPY %[[BASEHI]].sub0 +; SDAG-DAG: %[[BASE3:.*]]:sreg_32 = COPY %[[BASEHI]].sub1 +; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE killed %[[BASE0]], %subreg.sub0, killed %[[BASE1]], %subreg.sub1, killed %[[BASE2]], %subreg.sub2, killed %[[BASE3]], %subreg.sub3 ; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 77, ; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0 ; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1 @@ -100,21 +106,27 @@ ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4 ; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3 ; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 77, -define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) { +define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(ptr addrspace(8) inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) { %off = add nuw nsw i32 %i, 77 - %v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0) + %v = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %base, i32 %off, i32 0) store i32 %v, ptr addrspace(1) %out, align 4 ret void } ; GCN-LABEL: name: test_buffer_load_sgpr_or_imm_offset -; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0 -; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1 -; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2 -; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3 +; SDAG-DAG: %[[PTR0:.*]]:sgpr_32 = COPY $sgpr0 +; SDAG-DAG: %[[PTR1:.*]]:sgpr_32 = COPY $sgpr1 +; SDAG-DAG: %[[PTR2:.*]]:sgpr_32 = COPY $sgpr2 +; SDAG-DAG: %[[PTR3:.*]]:sgpr_32 = COPY $sgpr3 ; SDAG-DAG: %[[INDEX:.*]]:sgpr_32 = COPY $sgpr4 ; SDAG-DAG: %[[SHIFT:.*]]:sreg_32 = S_LSHL_B32 %[[INDEX]], -; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3 +; SDAG-DAG: %[[BASELO:.*]]:sgpr_64 = REG_SEQUENCE %[[PTR0]], %subreg.sub0, %[[PTR1]], %subreg.sub1 +; SDAG-DAG: %[[BASE0:.*]]:sreg_32 = COPY %[[BASELO]].sub0 +; SDAG-DAG: %[[BASE1:.*]]:sreg_32 = COPY %[[BASELO]].sub1 +; SDAG-DAG: %[[BASEHI:.*]]:sgpr_64 = REG_SEQUENCE %[[PTR2]], %subreg.sub0, %[[PTR3]], %subreg.sub1 +; SDAG-DAG: %[[BASE2:.*]]:sreg_32 = COPY %[[BASEHI]].sub0 +; SDAG-DAG: %[[BASE3:.*]]:sreg_32 = COPY %[[BASEHI]].sub1 +; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE killed %[[BASE0]], %subreg.sub0, killed %[[BASE1]], %subreg.sub1, killed %[[BASE2]], %subreg.sub2, killed %[[BASE3]], %subreg.sub3 ; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[SHIFT]], 5, ; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0 ; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1 @@ -124,17 +136,17 @@ ; GISEL-DAG: %[[SHIFT:.*]]:sreg_32 = S_LSHL_B32 %[[INDEX]], ; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3 ; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[SHIFT]], 5, -define amdgpu_cs void @test_buffer_load_sgpr_or_imm_offset(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) { +define amdgpu_cs void @test_buffer_load_sgpr_or_imm_offset(ptr addrspace(8) inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) { %shift = shl i32 %i, 7 %off = or i32 %shift, 5 - %v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0) + %v = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %base, i32 %off, i32 0) store i32 %v, ptr addrspace(1) %out, align 4 ret void } -declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #1 +declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32 immarg) #1 -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) nounwind readnone willreturn +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32 immarg) nounwind readnone willreturn ; Function Attrs: nounwind readnone speculatable declare i32 @llvm.amdgcn.reloc.constant(metadata) #3 @@ -143,7 +155,7 @@ declare i64 @llvm.amdgcn.s.getpc() #3 ; Function Attrs: nounwind readnone -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) #1 +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32 immarg) #1 attributes #0 = { argmemonly nounwind willreturn } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -287,12 +287,12 @@ ret void } -declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg) +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8), i32, i32 immarg) ; FUNC-LABEL: {{^}}bitcast_v4f32_to_v2i64: ; GCN: s_buffer_load_dwordx4 define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) { - %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> undef, i32 0, i32 0) + %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) undef, i32 0, i32 0) %cast = bitcast <4 x float> %val to <2 x i64> %div = udiv <2 x i64> %cast, %arg ret <2 x i64> %div diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll @@ -72,13 +72,13 @@ %e = getelementptr [2 x i32], ptr addrspace(5) %v1, i32 0, i32 %idx %x = load i32, ptr addrspace(5) %e %xf = bitcast i32 %x to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %xf, <4 x i32> undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %xf, ptr addrspace(8) undef, i32 0, i32 0, i32 0) ret void } attributes #0 = { nounwind "amdgpu-git-ptr-high"="0x1234" } -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) ; PAL: .amdgpu_pal_metadata diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-atomic-optimizations < %s | FileCheck %s -check-prefixes=GFX10 declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) -declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) declare float @llvm.amdgcn.strict.wqm.f32(float) define amdgpu_ps void @main(i32 %arg) { @@ -48,7 +48,7 @@ br label %bb4 bb4: - %i5 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 0, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0) + %i5 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 0, ptr addrspace(8) zeroinitializer, i32 0, i32 0, i32 0) %i7 = icmp eq i32 %arg, 1 br i1 %i7, label %bb8, label %bb4 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -8,13 +8,13 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() -declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) -declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32 immarg) -declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32 immarg) ; Show what the atomic optimization pass will do for raw buffers. -define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -214,12 +214,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %additive) { +define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -429,12 +429,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -760,12 +760,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout, i32 %vindex) { +define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) { ; GFX6-LABEL: struct_add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11 @@ -1108,12 +1108,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 %vindex, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -1178,12 +1178,12 @@ ; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -1390,12 +1390,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %subitive) { +define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -1609,12 +1609,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -1940,12 +1940,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -2010,7 +2010,7 @@ ; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -8,12 +8,12 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s declare i1 @llvm.amdgcn.wqm.vote(i1) -declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) ; Show what the atomic optimization pass will do for raw buffers. -define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) { +define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout) { ; GFX7-LABEL: add_i32_constant: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b64 s[10:11], exec @@ -231,19 +231,19 @@ ; GFX1132-NEXT: s_endpgm entry: %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true) - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true) %cond = and i1 %cond1, %cond2 br i1 %cond, label %if, label %else if: %bitcast = bitcast i32 %old to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, ptr addrspace(8) %out, i32 0, i32 0, i32 0) ret void else: ret void } -define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) { +define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout, i32 %val) { ; GFX7-LABEL: add_i32_varying: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_wqm_b64 s[8:9], -1 @@ -623,13 +623,13 @@ ; GFX1132-NEXT: s_endpgm entry: %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true) - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true) %cond = and i1 %cond1, %cond2 br i1 %cond, label %if, label %else if: %bitcast = bitcast i32 %old to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, ptr addrspace(8) %out, i32 0, i32 0, i32 0) ret void else: ret void diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -8,12 +8,12 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() -declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32) ; Show what the atomic optimization pass will do for raw buffers. -define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -213,12 +213,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %additive) { +define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -428,12 +428,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -759,12 +759,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -829,12 +829,12 @@ ; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -1041,12 +1041,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %subitive) { +define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -1260,12 +1260,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -1591,12 +1591,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -1661,7 +1661,7 @@ ; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -8,12 +8,12 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() -declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) -declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) +declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32) +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32, i32) ; Show what the atomic optimization pass will do for struct buffers. -define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -220,12 +220,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %additive) { +define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -442,12 +442,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -773,12 +773,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -843,12 +843,12 @@ ; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 @@ -945,12 +945,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -1164,12 +1164,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %subitive) { +define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec @@ -1390,12 +1390,12 @@ ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -1721,12 +1721,12 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd @@ -1791,12 +1791,12 @@ ; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) { +define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 @@ -1893,7 +1893,7 @@ ; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) + %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll b/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll --- a/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll @@ -5,31 +5,31 @@ ; CHECK-LABEL: {{^}}test_to_i16: ; CHECK: s_endpgm -define amdgpu_ps void @test_to_i16(<4 x i32> inreg, <4 x half> inreg) #0 { +define amdgpu_ps void @test_to_i16(ptr addrspace(8) inreg, <4 x half> inreg) #0 { %a_tmp = call <4 x half> @llvm.amdgcn.wqm.v4f16(<4 x half> %1) %a_i16_tmp = bitcast <4 x half> %a_tmp to <4 x i16> %a_i16 = call <4 x i16> @llvm.amdgcn.wqm.v4i16(<4 x i16> %a_i16_tmp) %a_i32 = bitcast <4 x i16> %a_i16 to <2 x i32> - call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %a_i32, <4 x i32> %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %a_i32, ptr addrspace(8) %0, i32 0, i32 0, i32 0) ret void } ; CHECK-LABEL: {{^}}test_to_half: ; CHECK: s_endpgm -define amdgpu_ps void @test_to_half(<4 x i32> inreg, <4 x i16> inreg) #0 { +define amdgpu_ps void @test_to_half(ptr addrspace(8) inreg, <4 x i16> inreg) #0 { %a_tmp = call <4 x i16> @llvm.amdgcn.wqm.v4i16(<4 x i16> %1) %a_half_tmp = bitcast <4 x i16> %a_tmp to <4 x half> %a_half = call <4 x half> @llvm.amdgcn.wqm.v4f16(<4 x half> %a_half_tmp) %a_i32 = bitcast <4 x half> %a_half to <2 x i32> - call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %a_i32, <4 x i32> %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %a_i32, ptr addrspace(8) %0, i32 0, i32 0, i32 0) ret void } declare <4 x half> @llvm.amdgcn.wqm.v4f16(<4 x half>) #1 declare <4 x i16> @llvm.amdgcn.wqm.v4i16(<4 x i16>) #1 -declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s -define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX908_GFX11: bb.0 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -15,8 +15,14 @@ ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908_GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908_GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -28,14 +34,20 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX908_GFX11: bb.0 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -47,8 +59,14 @@ ; GFX908_GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908_GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908_GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -61,14 +79,20 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX908_GFX11: bb.0 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -80,8 +104,14 @@ ; GFX908_GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908_GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908_GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -94,14 +124,20 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX908_GFX11: bb.0 (%ir-block.0): ; GFX908_GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 @@ -114,9 +150,15 @@ ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908_GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908_GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX908_GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908_GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX908_GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -130,13 +172,19 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll @@ -3,7 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s -define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -14,8 +14,14 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn @@ -28,15 +34,21 @@ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret float %ret } -define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -48,8 +60,14 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn @@ -63,15 +81,21 @@ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %ret } -define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -83,8 +107,14 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn @@ -98,15 +128,21 @@ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret float %ret } -define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 @@ -119,9 +155,15 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn @@ -136,14 +178,20 @@ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %ret } -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -14,16 +14,22 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f64_offen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_offen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 @@ -36,16 +42,22 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f64_idxen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_idxen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 @@ -58,16 +70,22 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_f64_bothen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_f64_bothen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 @@ -81,17 +99,23 @@ ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -103,20 +127,26 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY8]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY9]] + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 + ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY12]] + ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY13]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret double %ret } -define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 @@ -129,20 +159,26 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY9]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY10]] + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 + ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY13]] + ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY14]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret double %ret } -define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 @@ -155,20 +191,26 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY9]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY10]] + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 + ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY13]] + ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY14]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret double %ret } -define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 @@ -182,19 +224,25 @@ ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY10]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY11]] + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 + ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY14]] + ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY15]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret double %ret } -declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) -declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll @@ -3,7 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -14,8 +14,14 @@ ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -27,14 +33,20 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -46,8 +58,14 @@ ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -60,14 +78,20 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -79,8 +103,14 @@ ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -93,14 +123,20 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 @@ -113,9 +149,15 @@ ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -129,13 +171,19 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -13,15 +13,21 @@ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -33,15 +39,21 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret } -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -53,15 +65,21 @@ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } -define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 @@ -74,14 +92,20 @@ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret <2 x half> %ret } -declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,371 +10,380 @@ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg0, addrspace 6) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 16, align 1, addrspace 8) + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0, align 16, addrspace 6) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 8 + ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE killed [[S_ADD_I32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[REG_SEQUENCE1]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0 + 8, basealign 16, addrspace 6) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, killed [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 16, align 1, addrspace 8) ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 32, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 32, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 48, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 48, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 64, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 64, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 80, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 80, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 96, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 96, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 112, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 112, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY2]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY3]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 128, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 64 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], killed [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 128, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 128 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 128, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY4]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 144, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 72 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], killed [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 144, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 144 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 144, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY5]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 160, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 80 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_6]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 160, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 160 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 160, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 176, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 88 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 176, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 176 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 176, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY9]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 192, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 96 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 192, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 192 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 192, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE2]], [[COPY10]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY8]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY9]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY10]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 8) - ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY11]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 208, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 104 + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 208, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 208 + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 208, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE2]], [[COPY11]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; GCN-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY12]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 224, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY13]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY13]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 224, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY14]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY14]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 224, align 1, addrspace 8) + ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY15]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY16]], [[S_LOAD_DWORDX4_IMM]], [[COPY17]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY15]], [[REG_SEQUENCE2]], [[COPY16]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY17]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 240, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY18]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 240, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY19]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp0 + 240, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY20]], [[REG_SEQUENCE2]], [[COPY21]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; GCN-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY22]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 256, align 1, addrspace 8) ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY23]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 256, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY24]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 256, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY26]], [[S_LOAD_DWORDX4_IMM]], [[COPY27]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY25]], [[REG_SEQUENCE2]], [[COPY26]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; GCN-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY27]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 272, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY28]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_19]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 272, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY29]], [[REG_SEQUENCE2]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp0 + 272, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY30]], [[REG_SEQUENCE2]], [[COPY31]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; GCN-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY32]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 288, align 1, addrspace 8) ; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY33]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 288, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 288, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY35]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[COPY37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY36:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY35]], [[REG_SEQUENCE2]], [[COPY36]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; GCN-NEXT: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY37]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 304, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 152 + ; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY38]], [[REG_SEQUENCE2]], killed [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 304, align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 304 + ; GCN-NEXT: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY39]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp0 + 304, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[COPY40:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[COPY41:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY40]], [[REG_SEQUENCE2]], [[COPY41]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: S_ENDPGM 0 bb.0: - %tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0 - %buffer0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 16, i1 false, i1 false) #0 - %buffer1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 - %buffer2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 16, i1 false, i1 false) #0 - %buffer3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 16, i1 false, i1 false) #0 + %tmp0 = load ptr addrspace(8), ptr addrspace(6) %arg0, align 16, !invariant.load !0 + %buffer0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 16, i1 false, i1 false) #0 + %buffer1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 + %buffer2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 1, i32 16, i1 false, i1 false) #0 + %buffer3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 %arg1, i32 16, i1 false, i1 false) #0 ; Insert inline asm to keep the different instruction types from being mixed. This makes the output easier to read. call void asm sideeffect "", "" () - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer0, <4 x i32> %tmp0, i32 0, i32 32, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer2, <4 x i32> %tmp0, i32 1, i32 32, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer3, <4 x i32> %tmp0, i32 %arg1, i32 32, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer0, ptr addrspace(8) %tmp0, i32 0, i32 32, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer1, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer2, ptr addrspace(8) %tmp0, i32 1, i32 32, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer3, ptr addrspace(8) %tmp0, i32 %arg1, i32 32, i1 false, i1 false) #1 call void asm sideeffect "", "" () - %buffer_format0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 48, i1 false, i1 false) #0 - %buffer_format1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 - %buffer_format2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 1, i32 48, i1 false, i1 false) #0 - %buffer_format3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 48, i1 false, i1 false) #0 + %buffer_format0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 48, i1 false, i1 false) #0 + %buffer_format1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 + %buffer_format2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 1, i32 48, i1 false, i1 false) #0 + %buffer_format3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 %arg1, i32 48, i1 false, i1 false) #0 call void asm sideeffect "", "" () - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format0, <4 x i32> %tmp0, i32 0, i32 64, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format2, <4 x i32> %tmp0, i32 1, i32 64, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format3, <4 x i32> %tmp0, i32 %arg1, i32 64, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format0, ptr addrspace(8) %tmp0, i32 0, i32 64, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format1, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format2, ptr addrspace(8) %tmp0, i32 1, i32 64, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format3, ptr addrspace(8) %tmp0, i32 %arg1, i32 64, i1 false, i1 false) #1 call void asm sideeffect "", "" () - %atomic_add0 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 80, i1 false) #2 - %atomic_add1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - %atomic_add2 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 1, i32 80, i1 false) #2 - %atomic_add3 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 80, i1 false) #2 + %atomic_add0 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 80, i1 false) #2 + %atomic_add1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i1 false) #2 + %atomic_add2 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 1, i32 80, i1 false) #2 + %atomic_add3 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 %arg1, i32 80, i1 false) #2 call void asm sideeffect "", "" () - %atomic_cmpswap0 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 96, i1 false) #2 - %atomic_cmpswap1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - %atomic_cmpswap2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 1, i32 96, i1 false) #2 - %atomic_cmpswap3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 96, i1 false) #2 + %atomic_cmpswap0 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 96, i1 false) #2 + %atomic_cmpswap1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i1 false) #2 + %atomic_cmpswap2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 1, i32 96, i1 false) #2 + %atomic_cmpswap3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 %arg1, i32 96, i1 false) #2 call void asm sideeffect "", "" () - %fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2 - %fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - %fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2 - %fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2 + %fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %tmp0, i32 0, i32 112, i1 false) #2 + %fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i1 false) #2 + %fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %tmp0, i32 1, i32 112, i1 false) #2 + %fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %tmp0, i32 %arg1, i32 112, i1 false) #2 call void asm sideeffect "", "" () ; rsrc, offset, soffset, cachepolicy - %raw_buffer0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 128, i32 0, i32 0) #0 - %raw_buffer1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 64, i32 64, i32 0) #0 - %raw_buffer2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 128, i32 0) #0 - %raw_buffer3 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 128, i32 0) #0 - %raw_buffer4 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 128, i32 %arg1, i32 0) #0 + %raw_buffer0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 128, i32 0, i32 0) #0 + %raw_buffer1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 64, i32 64, i32 0) #0 + %raw_buffer2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 128, i32 0) #0 + %raw_buffer3 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 %arg1, i32 128, i32 0) #0 + %raw_buffer4 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 128, i32 %arg1, i32 0) #0 call void asm sideeffect "", "" () - %raw_buffer_format0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 144, i32 0, i32 0) #0 - %raw_buffer_format1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 72, i32 72, i32 0) #0 - %raw_buffer_format2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 144, i32 0) #0 - %raw_buffer_format3 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 144, i32 0) #0 - %raw_buffer_format4 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 144, i32 %arg1, i32 0) #0 + %raw_buffer_format0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 144, i32 0, i32 0) #0 + %raw_buffer_format1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 72, i32 72, i32 0) #0 + %raw_buffer_format2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 144, i32 0) #0 + %raw_buffer_format3 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 %arg1, i32 144, i32 0) #0 + %raw_buffer_format4 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 144, i32 %arg1, i32 0) #0 call void asm sideeffect "", "" () - %raw_atomic_add0 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 160, i32 0, i32 0) #2 - %raw_atomic_add1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 80, i32 80, i32 0) #2 - %raw_atomic_add2 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 160, i32 0) #2 - %raw_atomic_add3 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 160, i32 0) #2 - %raw_atomic_add4 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 160, i32 %arg1, i32 0) #2 + %raw_atomic_add0 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 160, i32 0, i32 0) #2 + %raw_atomic_add1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 80, i32 80, i32 0) #2 + %raw_atomic_add2 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 160, i32 0) #2 + %raw_atomic_add3 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 %arg1, i32 160, i32 0) #2 + %raw_atomic_add4 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 160, i32 %arg1, i32 0) #2 call void asm sideeffect "", "" () - %raw_atomic_cmpswap0 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 176, i32 0, i32 0) #2 - %raw_atomic_cmpswap1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 88, i32 88, i32 0) #2 - %raw_atomic_cmpswap2 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 176, i32 0) #2 - %raw_atomic_cmpswap3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 176, i32 0) #2 - %raw_atomic_cmpswap4 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 176, i32 %arg1, i32 0) #2 + %raw_atomic_cmpswap0 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 176, i32 0, i32 0) #2 + %raw_atomic_cmpswap1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 88, i32 88, i32 0) #2 + %raw_atomic_cmpswap2 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 176, i32 0) #2 + %raw_atomic_cmpswap3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 %arg1, i32 176, i32 0) #2 + %raw_atomic_cmpswap4 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 176, i32 %arg1, i32 0) #2 call void asm sideeffect "", "" () - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer0, <4 x i32> %tmp0, i32 192, i32 0, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer1, <4 x i32> %tmp0, i32 96, i32 96, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer2, <4 x i32> %tmp0, i32 0, i32 192, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer3, <4 x i32> %tmp0, i32 %arg1, i32 192, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer4, <4 x i32> %tmp0, i32 192, i32 %arg1, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer0, ptr addrspace(8) %tmp0, i32 192, i32 0, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer1, ptr addrspace(8) %tmp0, i32 96, i32 96, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer2, ptr addrspace(8) %tmp0, i32 0, i32 192, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer3, ptr addrspace(8) %tmp0, i32 %arg1, i32 192, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer4, ptr addrspace(8) %tmp0, i32 192, i32 %arg1, i32 0) #2 call void asm sideeffect "", "" () - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format0, <4 x i32> %tmp0, i32 208, i32 0, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format1, <4 x i32> %tmp0, i32 104, i32 104, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format2, <4 x i32> %tmp0, i32 0, i32 208, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format3, <4 x i32> %tmp0, i32 %arg1, i32 208, i32 0) #2 - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format4, <4 x i32> %tmp0, i32 208, i32 %arg1, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format0, ptr addrspace(8) %tmp0, i32 208, i32 0, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format1, ptr addrspace(8) %tmp0, i32 104, i32 104, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format2, ptr addrspace(8) %tmp0, i32 0, i32 208, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format3, ptr addrspace(8) %tmp0, i32 %arg1, i32 208, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format4, ptr addrspace(8) %tmp0, i32 208, i32 %arg1, i32 0) #2 call void asm sideeffect "", "" () ; rsrc, vindex, offset, soffset, cachepolicy - %struct_buffer0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 224, i32 0, i32 0) #0 - %struct_buffer1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 112, i32 112, i32 0) #0 - %struct_buffer2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 0, i32 224, i32 0) #0 - %struct_buffer3 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i32 224, i32 0) #0 - %struct_buffer4 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 224, i32 %arg1, i32 0) #0 - %struct_buffer5 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 224, i32 0, i32 0) #0 - %struct_buffer6 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 224, i32 0, i32 0) #0 + %struct_buffer0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 224, i32 0, i32 0) #0 + %struct_buffer1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 112, i32 112, i32 0) #0 + %struct_buffer2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 0, i32 224, i32 0) #0 + %struct_buffer3 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i32 224, i32 0) #0 + %struct_buffer4 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 224, i32 %arg1, i32 0) #0 + %struct_buffer5 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 1, i32 224, i32 0, i32 0) #0 + %struct_buffer6 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %tmp0, i32 %arg1, i32 224, i32 0, i32 0) #0 call void asm sideeffect "", "" () - %struct_buffer_format0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 240, i32 0, i32 0) #0 - %struct_buffer_format1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 120, i32 120, i32 0) #0 - %struct_buffer_format2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 0, i32 240, i32 0) #0 - %struct_buffer_format3 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i32 240, i32 0) #0 - %struct_buffer_format4 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 240, i32 %arg1, i32 0) #0 - %struct_buffer_format5 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 1, i32 240, i32 0, i32 0) #0 - %struct_buffer_format6 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 240, i32 0, i32 0) #0 + %struct_buffer_format0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 240, i32 0, i32 0) #0 + %struct_buffer_format1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 120, i32 120, i32 0) #0 + %struct_buffer_format2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 0, i32 240, i32 0) #0 + %struct_buffer_format3 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i32 240, i32 0) #0 + %struct_buffer_format4 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 0, i32 240, i32 %arg1, i32 0) #0 + %struct_buffer_format5 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 1, i32 240, i32 0, i32 0) #0 + %struct_buffer_format6 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp0, i32 %arg1, i32 240, i32 0, i32 0) #0 call void asm sideeffect "", "" () - %struct_atomic_add0 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 256, i32 0, i32 0) #2 - %struct_atomic_add1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 128, i32 128, i32 0) #2 - %struct_atomic_add2 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 0, i32 256, i32 0) #2 - %struct_atomic_add3 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 256, i32 0) #2 - %struct_atomic_add4 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 256, i32 %arg1, i32 0) #2 - %struct_atomic_add5 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 1, i32 256, i32 0, i32 0) #2 - %struct_atomic_add6 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 256, i32 0, i32 0) #2 + %struct_atomic_add0 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 256, i32 0, i32 0) #2 + %struct_atomic_add1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 128, i32 128, i32 0) #2 + %struct_atomic_add2 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 0, i32 256, i32 0) #2 + %struct_atomic_add3 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i32 256, i32 0) #2 + %struct_atomic_add4 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 256, i32 %arg1, i32 0) #2 + %struct_atomic_add5 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 1, i32 256, i32 0, i32 0) #2 + %struct_atomic_add6 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, ptr addrspace(8) %tmp0, i32 %arg1, i32 256, i32 0, i32 0) #2 call void asm sideeffect "", "" () - %struct_atomic_cmpswap0 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 272, i32 0, i32 0) #2 - %struct_atomic_cmpswap1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 136, i32 136, i32 0) #2 - %struct_atomic_cmpswap2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 0, i32 272, i32 0) #2 - %struct_atomic_cmpswap3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 272, i32 0) #2 - %struct_atomic_cmpswap4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 272, i32 %arg1, i32 0) #2 - %struct_atomic_cmpswap5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 1, i32 272, i32 0, i32 0) #2 - %struct_atomic_cmpswap6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 272, i32 0, i32 0) #2 + %struct_atomic_cmpswap0 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 272, i32 0, i32 0) #2 + %struct_atomic_cmpswap1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 136, i32 136, i32 0) #2 + %struct_atomic_cmpswap2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 0, i32 272, i32 0) #2 + %struct_atomic_cmpswap3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i32 272, i32 0) #2 + %struct_atomic_cmpswap4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 0, i32 272, i32 %arg1, i32 0) #2 + %struct_atomic_cmpswap5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 1, i32 272, i32 0, i32 0) #2 + %struct_atomic_cmpswap6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, ptr addrspace(8) %tmp0, i32 %arg1, i32 272, i32 0, i32 0) #2 call void asm sideeffect "", "" () - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer0, <4 x i32> %tmp0, i32 0, i32 288, i32 0, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer1, <4 x i32> %tmp0, i32 0, i32 144, i32 144, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer2, <4 x i32> %tmp0, i32 0, i32 0, i32 288, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer3, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 288, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer4, <4 x i32> %tmp0, i32 0, i32 288, i32 %arg1, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer5, <4 x i32> %tmp0, i32 1, i32 288, i32 0, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer6, <4 x i32> %tmp0, i32 %arg1, i32 288, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer0, ptr addrspace(8) %tmp0, i32 0, i32 288, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer1, ptr addrspace(8) %tmp0, i32 0, i32 144, i32 144, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer2, ptr addrspace(8) %tmp0, i32 0, i32 0, i32 288, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer3, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i32 288, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer4, ptr addrspace(8) %tmp0, i32 0, i32 288, i32 %arg1, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer5, ptr addrspace(8) %tmp0, i32 1, i32 288, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer6, ptr addrspace(8) %tmp0, i32 %arg1, i32 288, i32 0, i32 0) #2 call void asm sideeffect "", "" () - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format0, <4 x i32> %tmp0, i32 0, i32 304, i32 0, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format1, <4 x i32> %tmp0, i32 0, i32 152, i32 152, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format2, <4 x i32> %tmp0, i32 0, i32 0, i32 304, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format3, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 304, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format4, <4 x i32> %tmp0, i32 0, i32 304, i32 %arg1, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format5, <4 x i32> %tmp0, i32 1, i32 304, i32 0, i32 0) #2 - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format6, <4 x i32> %tmp0, i32 %arg1, i32 304, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format0, ptr addrspace(8) %tmp0, i32 0, i32 304, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format1, ptr addrspace(8) %tmp0, i32 0, i32 152, i32 152, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format2, ptr addrspace(8) %tmp0, i32 0, i32 0, i32 304, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format3, ptr addrspace(8) %tmp0, i32 0, i32 %arg1, i32 304, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format4, ptr addrspace(8) %tmp0, i32 0, i32 304, i32 %arg1, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format5, ptr addrspace(8) %tmp0, i32 1, i32 304, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format6, ptr addrspace(8) %tmp0, i32 %arg1, i32 304, i32 0, i32 0) #2 ret void } -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2 -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2 -declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) #2 -declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2 -declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2 - -declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32) #2 -declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32) #2 -declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2 -declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i1) #2 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, ptr addrspace(8), i32, i32, i1) #2 +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i1) #2 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #2 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32) #2 +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) #2 +declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) #2 + +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32, i32) #2 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32) #2 +declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32) #2 attributes #0 = { nounwind readonly } attributes #1 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=SDAG %s + +define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { +; GISEL-LABEL: buffer_ptr_vector_ops: +; GISEL: ; %bb.0: ; %main_body +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GISEL-NEXT: v_mov_b32_e32 v7, s7 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:32 +; GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] offset:48 +; GISEL-NEXT: s_endpgm +; +; SDAG-LABEL: buffer_ptr_vector_ops: +; SDAG: ; %bb.0: ; %main_body +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-NEXT: v_mov_b32_e32 v3, s3 +; SDAG-NEXT: v_mov_b32_e32 v4, s4 +; SDAG-NEXT: v_mov_b32_e32 v5, s5 +; SDAG-NEXT: v_mov_b32_e32 v6, s6 +; SDAG-NEXT: v_mov_b32_e32 v7, s7 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] offset:48 +; SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:32 +; SDAG-NEXT: s_endpgm +main_body: + %buffers = load <2 x ptr addrspace(8)>, ptr addrspace(1) %somewhere + %buf1 = extractelement <2 x ptr addrspace(8)> %buffers, i32 0 + %buf2 = extractelement <2 x ptr addrspace(8)> %buffers, i32 1 + %buf1.int = ptrtoint ptr addrspace(8) %buf1 to i128 + %buf1.vec = bitcast i128 %buf1.int to <4 x i32> + call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %buf1.vec, ptr addrspace(8) %buf2, i32 0, i32 0, i32 0) + %shuffled = shufflevector <2 x ptr addrspace(8)> %buffers, <2 x ptr addrspace(8)> undef, <2 x i32> + %somewhere.next = getelementptr <2 x ptr addrspace(8)>, ptr addrspace(1) %somewhere, i64 1 + store <2 x ptr addrspace(8)> %shuffled, ptr addrspace(1) %somewhere.next + ret void +} + +%fat_buffer_struct = type {ptr addrspace(8), i32} + +define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace(1) %dest) { +; GISEL-LABEL: buffer_structs: +; GISEL: ; %bb.0: ; %main_body +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_ashr_i32 s3, s2, 31 +; GISEL-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 +; GISEL-NEXT: s_add_u32 s0, s8, s0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GISEL-NEXT: s_addc_u32 s1, s9, s1 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: buffer_store_dword v4, v4, s[4:7], 0 offen +; GISEL-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dword v5, v4, s[0:1] offset:16 +; GISEL-NEXT: s_endpgm +; +; SDAG-LABEL: buffer_structs: +; SDAG: ; %bb.0: ; %main_body +; SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_ashr_i32 s3, s2, 31 +; SDAG-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 +; SDAG-NEXT: s_add_u32 s0, s8, s0 +; SDAG-NEXT: v_mov_b32_e32 v0, s2 +; SDAG-NEXT: s_addc_u32 s1, s9, s1 +; SDAG-NEXT: buffer_store_dword v0, v0, s[4:7], 0 offen +; SDAG-NEXT: global_store_dword v4, v0, s[0:1] offset:16 +; SDAG-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-NEXT: v_mov_b32_e32 v1, s5 +; SDAG-NEXT: v_mov_b32_e32 v2, s6 +; SDAG-NEXT: v_mov_b32_e32 v3, s7 +; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; SDAG-NEXT: s_endpgm +main_body: + %buffer = extractvalue %fat_buffer_struct %arg, 0 + %offset = extractvalue %fat_buffer_struct %arg, 1 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %offset, ptr addrspace(8) %buffer, i32 %offset, i32 0, i32 0) + ; Confirm the alignment of this struct is 32 bytes + %dest.next = getelementptr %fat_buffer_struct, ptr addrspace(1) %dest, i32 %offset + store %fat_buffer_struct %arg, ptr addrspace(1) %dest.next + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll @@ -21,26 +21,26 @@ %tmp10 = extractelement <3 x i32> %tmp9, i32 0 %tmp11 = shl i32 %tmp10, 2 %tmp12 = inttoptr i64 undef to ptr addrspace(4) - %tmp13 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp14 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp13, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp17 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - call void @llvm.amdgcn.buffer.store.f32(float %tmp14, <4 x i32> %tmp17, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp20 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp21 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0 + %tmp13 = load ptr addrspace(8), ptr addrspace(4) %tmp12, align 16 + %tmp14 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %tmp13, i32 0, i32 %tmp11, i1 false, i1 false) #0 + %tmp17 = load ptr addrspace(8), ptr addrspace(4) %tmp12, align 16 + call void @llvm.amdgcn.buffer.store.f32(float %tmp14, ptr addrspace(8) %tmp17, i32 0, i32 %tmp11, i1 false, i1 false) #0 + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %tmp12, align 16 + %tmp21 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0 %tmp22 = fadd reassoc nnan arcp contract float %tmp21, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp22, <4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp25 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp26 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0 + call void @llvm.amdgcn.buffer.store.f32(float %tmp22, ptr addrspace(8) %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0 + %tmp25 = load ptr addrspace(8), ptr addrspace(4) %tmp12, align 16 + %tmp26 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0 %tmp27 = fadd reassoc nnan arcp contract float %tmp26, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp27, <4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp30 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp31 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0 + call void @llvm.amdgcn.buffer.store.f32(float %tmp27, ptr addrspace(8) %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0 + %tmp30 = load ptr addrspace(8), ptr addrspace(4) %tmp12, align 16 + %tmp31 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0 %tmp32 = fadd reassoc nnan arcp contract float %tmp31, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp32, <4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp35 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp36 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0 + call void @llvm.amdgcn.buffer.store.f32(float %tmp32, ptr addrspace(8) %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0 + %tmp35 = load ptr addrspace(8), ptr addrspace(4) %tmp12, align 16 + %tmp36 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0 %tmp37 = fadd reassoc nnan arcp contract float %tmp36, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp37, <4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0 + call void @llvm.amdgcn.buffer.store.f32(float %tmp37, ptr addrspace(8) %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0 ret void } @@ -48,21 +48,21 @@ ; GCN: buffer_store_dword ; GCN: buffer_load_dword ; GCN: buffer_store_dword -define amdgpu_cs void @test1(<4 x i32> inreg %buf, i32 %off) { +define amdgpu_cs void @test1(ptr addrspace(8) inreg %buf, i32 %off) { .entry: - call void @llvm.amdgcn.raw.buffer.store.i32(i32 0, <4 x i32> %buf, i32 8, i32 0, i32 0) - %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %buf, i32 %off, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %buf, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 0, ptr addrspace(8) %buf, i32 8, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %buf, i32 %off, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, ptr addrspace(8) %buf, i32 0, i32 0, i32 0) ret void } -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 +declare float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8), i32, i32, i1, i1) #2 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #3 +declare void @llvm.amdgcn.buffer.store.f32(float, ptr addrspace(8), i32, i32, i1, i1) #3 -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32) #2 -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #3 +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32) #3 attributes #2 = { nounwind readonly } attributes #3 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/bug-vopc-commute.ll b/llvm/test/CodeGen/AMDGPU/bug-vopc-commute.ll --- a/llvm/test/CodeGen/AMDGPU/bug-vopc-commute.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-vopc-commute.ll @@ -8,8 +8,8 @@ ; of which were in SGPRs. define amdgpu_vs float @main(i32 %v) { main_body: - %d1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 960, i32 0) - %d2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 976, i32 0) + %d1 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 960, i32 0) + %d2 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 976, i32 0) br i1 undef, label %ENDIF56, label %IF57 IF57: ; preds = %ENDIF @@ -41,7 +41,7 @@ } ; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #0 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #0 attributes #0 = { nounwind readnone } attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll --- a/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll @@ -111,7 +111,7 @@ ; CHECK: s_xor_b64 s[0:1], s[0:1], s[30:31] ; CHECK: s_xor_b64 s[0:1], s[0:1], s[32:33] ; CHECK: s_xor_b64 s[0:1], s[0:1], s[34:35] -define amdgpu_gs void @_amdgpu_gs_sgpr_limit_i64(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, <4 x i32> inreg %addr) { +define amdgpu_gs void @_amdgpu_gs_sgpr_limit_i64(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, ptr addrspace(8) inreg %addr) { .entry: %19 = xor i64 %0, %1 %20 = xor i64 %19, %2 @@ -131,8 +131,8 @@ %34 = xor i64 %33, %16 %35 = xor i64 %34, %17 %36 = bitcast i64 %35 to <2 x i32> - call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %36, <4 x i32> %addr, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %36, ptr addrspace(8) %addr, i32 4, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll b/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll --- a/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll @@ -98,4 +98,4 @@ ret { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %135 } -declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -68,7 +68,7 @@ br i1 undef, label %bb9, label %bb bb: ; preds = %.a - %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 %.2, i32 64, i32 1) + %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) %i5 = icmp eq i32 %.i3, 0 br label %bb9 @@ -150,7 +150,7 @@ br i1 undef, label %bb9, label %bb bb: ; preds = %.a - %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 %.2, i32 64, i32 1) + %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) %i5 = icmp eq i32 %.i3, 0 br label %bb9 @@ -235,7 +235,7 @@ br i1 undef, label %bb9, label %bb bb: ; preds = %.a - %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 %.2, i32 64, i32 1) + %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) %i5 = icmp eq i32 %.i3, 0 br label %bb9 @@ -321,7 +321,7 @@ br i1 undef, label %bb9, label %bb bb: ; preds = %.a - %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 %.2, i32 64, i32 1) + %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) %i5 = icmp eq i32 %.i3, 0 br label %bb9 @@ -402,7 +402,7 @@ br i1 undef, label %bb9, label %bb bb: ; preds = %.a - %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 %.2, i32 64, i32 1) + %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) %i5 = icmp eq i32 %.i3, 0 br label %bb9 @@ -483,7 +483,7 @@ br i1 undef, label %bb9, label %bb bb: ; preds = %.a - %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 %.2, i32 64, i32 1) + %.i3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) %i5 = icmp eq i32 %.i3, 0 br label %bb9 @@ -502,7 +502,7 @@ ; Function Attrs: nounwind readonly willreturn -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -305,8 +305,8 @@ ; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]] define amdgpu_vs float @vgpr_arg_src(ptr addrspace(6) %arg) { main_body: - %tmp9 = load <4 x i32>, ptr addrspace(6) %arg - %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp9, i32 undef, i32 0, i32 0, i32 0) #1 + %tmp9 = load ptr addrspace(8), ptr addrspace(6) %arg + %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %tmp9, i32 undef, i32 0, i32 0, i32 0) #1 ret float %tmp10 } @@ -316,7 +316,7 @@ ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7 -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #7 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32) #7 !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll --- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -26,9 +26,9 @@ ; GCN-NEXT: s_endpgm entry: ; preds = %1009 %0 = load i32, ptr addrspace(1) %in, align 4 - %1 = load <4 x i32>, ptr addrspace(4) %addrSrc, align 16 + %1 = load ptr addrspace(8), ptr addrspace(4) %addrSrc, align 16 %2 = icmp ne i32 %0, 0 - %3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %1, i32 252, i32 0, i32 0) + %3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %1, i32 252, i32 0, i32 0) %4 = icmp ne i32 %3, 0 %5 = xor i1 %2, %4 %result = select i1 %5, i32 2, i32 3 @@ -36,4 +36,4 @@ ret void } -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -591,7 +591,7 @@ ret void } -define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.swap(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.swap: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -606,14 +606,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.add(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -628,14 +628,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.sub(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.sub: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -650,14 +650,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.smin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.smin: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -672,14 +672,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.smax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.smax: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -694,14 +694,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.umin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.umin: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -716,14 +716,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.umax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.umax: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -738,14 +738,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.and(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.and: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -760,14 +760,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.or(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.or: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -782,14 +782,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.xor(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.xor: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -804,14 +804,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.inc(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.inc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -826,14 +826,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.dec(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.dec: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -848,14 +848,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.cmpswap(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.cmpswap: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -871,14 +871,14 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 1, i32 2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 1, i32 2, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } -define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.fadd(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.fadd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -894,7 +894,7 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %f32 = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %f32 = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n32 = fptoui float %f32 to i32 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 @@ -902,7 +902,7 @@ ret void } -define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.fmin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.fmin: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -919,7 +919,7 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n32 = fptoui double %f64 to i32 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 @@ -927,7 +927,7 @@ ret void } -define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.atomic.fmax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.fmax: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -944,7 +944,7 @@ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) %n32 = fptoui double %f64 to i32 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 @@ -956,19 +956,19 @@ declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1), i32, i32 immarg, i32 immarg, i1 immarg) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1), double) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1), double) -declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32) -declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32) -declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32) -declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, ptr addrspace(8), i32, i32, i32) +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32) +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32) +declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32) +declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -38,7 +38,7 @@ ; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 ; GCN-NEXT: v_fma_f32 v7, -s2, v6, s6 ; GCN-NEXT: v_fma_f32 v5, v6, v5, 1.0 -; GCN-NEXT: v_mad_f32 v10, s2, v6, v2 +; GCN-NEXT: v_fma_f32 v10, s2, v6, v2 ; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a ; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8 ; GCN-NEXT: v_fmac_f32_e32 v10, v7, v6 @@ -69,7 +69,7 @@ .entry: %0 = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) %.i2243 = extractelement <3 x float> %0, i32 2 - %1 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> undef, i32 0, i32 0) + %1 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) undef, i32 0, i32 0) %2 = shufflevector <3 x i32> %1, <3 x i32> poison, <4 x i32> %3 = bitcast <4 x i32> %2 to <4 x float> %.i2248 = extractelement <4 x float> %3, i32 2 @@ -82,17 +82,17 @@ %.i1408 = extractelement <2 x float> %7, i32 1 %.i0364 = extractelement <2 x float> %7, i32 0 %8 = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) - %9 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> undef, i32 112, i32 0) + %9 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) undef, i32 112, i32 0) %10 = shufflevector <3 x i32> %9, <3 x i32> poison, <4 x i32> %11 = bitcast <4 x i32> %10 to <4 x float> %.i2360 = extractelement <4 x float> %11, i32 2 %.i2363 = fmul reassoc nnan nsz arcp contract afn float %.i2360, %8 - %12 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> undef, i32 96, i32 0) + %12 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) undef, i32 96, i32 0) %13 = shufflevector <3 x i32> %12, <3 x i32> poison, <4 x i32> %14 = bitcast <4 x i32> %13 to <4 x float> %.i2367 = extractelement <4 x float> %14, i32 2 %.i2370 = fmul reassoc nnan nsz arcp contract afn float %.i0364, %.i2367 - %15 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> undef, i32 32, i32 0) + %15 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) undef, i32 32, i32 0) %16 = shufflevector <3 x i32> %15, <3 x i32> poison, <4 x i32> %17 = bitcast <4 x i32> %16 to <4 x float> %.i2373 = extractelement <4 x float> %17, i32 2 @@ -105,19 +105,19 @@ %.i2397 = fmul reassoc nnan nsz arcp contract afn float %.i2363, %18 %.i2404 = fmul reassoc nnan nsz arcp contract afn float %.i2394, %4 %.i2407 = fadd reassoc nnan nsz arcp contract afn float %.i2397, %.i2404 - %20 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 92, i32 0) + %20 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 92, i32 0) %21 = bitcast i32 %20 to float - %22 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 124, i32 0) + %22 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 124, i32 0) %23 = bitcast i32 %22 to float %24 = fsub reassoc nnan nsz arcp contract afn float %21, %23 %25 = fmul reassoc nnan nsz arcp contract afn float %.i1408, %24 %26 = fadd reassoc nnan nsz arcp contract afn float %25, %23 - %27 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 44, i32 0) + %27 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 44, i32 0) %28 = bitcast i32 %27 to float %29 = fsub reassoc nnan nsz arcp contract afn float %28, %26 %30 = fmul reassoc nnan nsz arcp contract afn float %6, %29 %31 = fadd reassoc nnan nsz arcp contract afn float %26, %30 - %32 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 192, i32 0) + %32 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 192, i32 0) %33 = bitcast i32 %32 to float %34 = fadd reassoc nnan nsz arcp contract afn float %33, -1.000000e+00 %35 = fmul reassoc nnan nsz arcp contract afn float %18, %34 @@ -135,12 +135,12 @@ %.i2465 = extractelement <3 x float> %44, i32 2 %.i2466 = fmul reassoc nnan nsz arcp contract afn float %.i2465, %43 %.i2469 = fmul reassoc nnan nsz arcp contract afn float %.i2415, %.i2466 - %45 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> undef, i32 64, i32 0) + %45 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) undef, i32 64, i32 0) %46 = shufflevector <3 x i32> %45, <3 x i32> poison, <4 x i32> %47 = bitcast <4 x i32> %46 to <4 x float> %.i2476 = extractelement <4 x float> %47, i32 2 %.i2479 = fmul reassoc nnan nsz arcp contract afn float %.i2476, %18 - %48 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> undef, i32 80, i32 0) + %48 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) undef, i32 80, i32 0) %49 = shufflevector <3 x i32> %48, <3 x i32> poison, <4 x i32> %50 = bitcast <4 x i32> %49 to <4 x float> %.i2482 = extractelement <4 x float> %50, i32 2 @@ -153,7 +153,7 @@ %.i2516 = fadd reassoc nnan nsz arcp contract afn float %.i2515, %.i2494 %.i2522 = fadd reassoc nnan nsz arcp contract afn float %.i2521, %.i2516 %.i2525 = fmul reassoc nnan nsz arcp contract afn float %.i2522, %43 - %52 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> undef, i32 16, i32 0) + %52 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) undef, i32 16, i32 0) %53 = shufflevector <3 x i32> %52, <3 x i32> poison, <4 x i32> %54 = bitcast <4 x i32> %53 to <4 x float> %.i2530 = extractelement <4 x float> %54, i32 2 @@ -276,10 +276,10 @@ declare <3 x float> @llvm.amdgcn.image.load.mip.2d.v3f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #3 ; Function Attrs: nounwind readnone willreturn -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) #3 +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32 immarg) #3 ; Function Attrs: nounwind readnone willreturn -declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg) #3 +declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8), i32, i32 immarg) #3 attributes #0 = { "denormal-fp-math-f32"="preserve-sign" } attributes #1 = { nofree nosync nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -48,11 +48,11 @@ end: %r = phi float [ %v.if, %if ], [ %v.else, %else ] - call void @llvm.amdgcn.raw.buffer.store.f32(float %r, <4 x i32> undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %r, ptr addrspace(8) undef, i32 0, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #1 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #1 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 immarg, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -24,13 +24,13 @@ ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) ; GCN-NEXT: S_ENDPGM 0 main_body: - %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) + %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) undef, i32 undef, i32 0, i32 0) %tmp27 = bitcast <4 x float> %tmp25 to <16 x i8> %tmp28 = shufflevector <16 x i8> %tmp27, <16 x i8> undef, <12 x i32> %tmp29 = bitcast <12 x i8> %tmp28 to <3 x i32> - call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %tmp29, <4 x i32> undef, i32 undef, i32 0, i32 0) #3 + call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %tmp29, ptr addrspace(8) undef, i32 undef, i32 0, i32 0) #3 ret void } -declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg) -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll --- a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll @@ -9,11 +9,11 @@ ; FIXME: Should also really make sure the v2f16 version fails. ; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD -define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) { - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false) +define amdgpu_cs void @atomic_fadd(ptr addrspace(8) inreg %arg0) { + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %arg0, i32 0, i32 112, i1 false) ret void } -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0 +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i1 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -3,7 +3,7 @@ ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. ; GCN-LABEL: wwm: -define amdgpu_hs void @wwm(i32 inreg %arg, <4 x i32> inreg %buffer) { +define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { entry: br label %work @@ -16,7 +16,7 @@ br i1 %tmp607, label %bb49, label %bb54 bb49: - call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, <4 x i32> %buffer, i32 4, i32 1, i32 116, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, ptr addrspace(8) %buffer, i32 4, i32 1, i32 116, i32 1) ret void bb54: @@ -40,7 +40,7 @@ } ; GCN-LABEL: strict_wwm: -define amdgpu_hs void @strict_wwm(i32 inreg %arg, <4 x i32> inreg %buffer) { +define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { entry: br label %work @@ -53,7 +53,7 @@ br i1 %tmp607, label %bb49, label %bb54 bb49: - call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, <4 x i32> %buffer, i32 4, i32 1, i32 116, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, ptr addrspace(8) %buffer, i32 4, i32 1, i32 116, i32 1) ret void bb54: @@ -79,7 +79,7 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i32 @llvm.amdgcn.wwm.i32(i32) #1 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 -declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2 +declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #2 attributes #0 = { convergent nounwind readnone willreturn } attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -11,11 +11,11 @@ ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1100 -declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg) -declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, ptr addrspace(8), i32, i32, i32 immarg) -define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -122,11 +122,11 @@ ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc @@ -207,12 +207,12 @@ ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store float %ret, ptr addrspace(1) undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { +define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd @@ -364,12 +364,12 @@ ; G_GFX1100-NEXT: s_endpgm ; GFX1010-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: main_body: - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store float %ret, ptr addrspace(3) %out, align 8 ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -476,11 +476,11 @@ ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc @@ -561,12 +561,12 @@ ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store float %ret, ptr addrspace(1) undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 @@ -697,7 +697,7 @@ ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store float %ret, ptr addrspace(1) %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -2,13 +2,13 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A ; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 -declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1) -declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) -declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) -declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) -declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg) -declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) -declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i1) +declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -18,7 +18,7 @@ declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) -define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -41,11 +41,11 @@ ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) ret void } -define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: buffer_atomic_add_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc @@ -60,12 +60,12 @@ ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -96,12 +96,12 @@ ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i1 1) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -124,11 +124,11 @@ ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc @@ -143,12 +143,12 @@ ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -179,12 +179,12 @@ ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -207,11 +207,11 @@ ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc @@ -226,12 +226,12 @@ ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -262,12 +262,12 @@ ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -290,11 +290,11 @@ ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc @@ -309,12 +309,12 @@ ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -345,12 +345,12 @@ ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -373,11 +373,11 @@ ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc @@ -392,12 +392,12 @@ ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -428,12 +428,12 @@ ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -456,11 +456,11 @@ ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc @@ -475,12 +475,12 @@ ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -511,12 +511,12 @@ ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -539,11 +539,11 @@ ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc @@ -558,12 +558,12 @@ ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -594,7 +594,7 @@ ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -9,11 +9,11 @@ ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030 -declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg) -declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) -define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -114,11 +114,11 @@ ; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f64: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc @@ -179,12 +179,12 @@ ; G_GFX1030-NEXT: ds_write_b64 v0, v[0:1] ; G_GFX1030-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr addrspace(3) undef ret void } -define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { +define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc @@ -245,12 +245,12 @@ ; G_GFX1030-NEXT: ds_write_b64 v3, v[0:1] ; G_GFX1030-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(3) %out, align 8 ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -351,11 +351,11 @@ ; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc @@ -416,12 +416,12 @@ ; G_GFX1030-NEXT: ds_write_b64 v0, v[0:1] ; G_GFX1030-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr addrspace(3) undef ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { +define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 @@ -530,7 +530,7 @@ ; G_GFX1030-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX1030-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(3) %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll --- a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll @@ -5,20 +5,22 @@ ; GFX9-DAG: buffer_load_format_d16_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding: ; GFX908-DAG: v_mfma_i32_4x4x4i8 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; encoding: [{{0x..,0x0.,}} ; GFX90A-DAG: v_mfma_i32_4x4x4i8 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; encoding: [{{0x..,0x8.,}} -define amdgpu_kernel void @test(<4 x i32> %x) #0 { +define amdgpu_kernel void @test(ptr addrspace(8) %x) #0 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %r1 = tail call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %x, i32 %id, i32 0, i32 0, i32 0) + %r1 = tail call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %x, i32 %id, i32 0, i32 0, i32 0) store volatile <4 x float> %r1, ptr undef - %r2 = tail call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %x, i32 %id, i32 0, i32 0, i32 0) + %r2 = tail call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8) %x, i32 %id, i32 0, i32 0, i32 0) store volatile <4 x half> %r2, ptr undef - %r3 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %x, i32 0, i32 0, i32 0) + %x.int = ptrtoint ptr addrspace(8) %x to i128 + %x.vec = bitcast i128 %x.int to <4 x i32> + %r3 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %x.vec, i32 0, i32 0, i32 0) store <4 x i32> %r3, ptr undef ret void } declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #1 -declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #1 +declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #1 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) #2 attributes #0 = { nounwind readnone speculatable willreturn "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) { +define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) { ; SI-LABEL: i1_copy_from_loop: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_mov_b32 s14, 0 @@ -61,7 +61,7 @@ br i1 %cc, label %mid.loop, label %for.end mid.loop: - %v = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %tid, i32 %i, i32 0, i32 0) + %v = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %tid, i32 %i, i32 0, i32 0) %cc2 = fcmp oge float %v, 0.0 br i1 %cc2, label %end.loop, label %for.end @@ -80,7 +80,7 @@ ret void } -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll @@ -1,22 +1,22 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GCN ; RUN: llc < %s -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefix=GCN -declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1) +declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, ptr addrspace(8), i32, i32, i1) declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32) ; GCN-LABEL: {{^}}buffer_atomic_csub: ; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps void @buffer_atomic_csub(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +define amdgpu_ps void @buffer_atomic_csub(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex) { main_body: - %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc: ; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc -define amdgpu_ps void @buffer_atomic_csub_off4_slc(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +define amdgpu_ps void @buffer_atomic_csub_off4_slc(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex) { main_body: - %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i1 1) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A ; RUN: not --crash llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i1) +declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i1) declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) @@ -10,33 +10,33 @@ ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32: ; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps float @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps float @buffer_atomic_add_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) ret float %ret } ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_off4_slc: ; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 glc slc -define amdgpu_ps float @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps float @buffer_atomic_add_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i1 1) ret float %ret } ; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16: ; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %vindex) { main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) ret <2 x half> %ret } ; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc: ; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 glc slc -define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16_off4_slc(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %vindex) { main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i1 1) ret <2 x half> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -1,41 +1,41 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN ; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GCN -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i1) +declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i1) declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float) ; GCN-LABEL: {{^}}buffer_atomic_add_f32: ; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps void @buffer_atomic_add_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_atomic_add_f32_off4_slc: ; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc -define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i1 1) ret void } ; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16: ; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +define amdgpu_ps void @buffer_atomic_pk_add_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %vindex) { main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc: ; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc -define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %vindex) { main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i1 1) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -19,16 +19,16 @@ ;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} -define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { +define amdgpu_ps float @test1(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { main_body: - %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) - %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) + %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) + %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o2, ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i1 0) + %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o3, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i1 0) %ofs.5 = add i32 %voffset, 42 - %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) - %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0) - %unused = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o4, ptr addrspace(8) %rsrc, i32 0, i32 %ofs.5, i1 0) + %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o5, ptr addrspace(8) %rsrc, i32 0, i32 8192, i1 0) + %unused = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o6, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %out = bitcast i32 %o6 to float ret float %out } @@ -50,17 +50,17 @@ ;VI: buffer_atomic_swap_x2 v[3:4], off, s[0:3], [[SOFS]] offset:4 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0{{$}} -define amdgpu_ps float @test11(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { +define amdgpu_ps float @test11(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { main_body: %o0 = sext i32 %data to i64 - %o1 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o0, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %o2 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %o3 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) - %o4 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) + %o1 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o0, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) + %o2 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %o3 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o2, ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i1 0) + %o4 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o3, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i1 0) %ofs.5 = add i32 %voffset, 42 - %o5 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) - %o6 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0) - %unused = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %o5 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o4, ptr addrspace(8) %rsrc, i32 0, i32 %ofs.5, i1 0) + %o6 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o5, ptr addrspace(8) %rsrc, i32 0, i32 8192, i1 0) + %unused = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o6, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %o7 = trunc i64 %o6 to i32 %out = bitcast i32 %o7 to float ret float %out @@ -85,17 +85,17 @@ ;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +define amdgpu_ps float @test2(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex) { main_body: - %t1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t7 = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t8 = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %t1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %t2, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %t3, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %t4, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %t5, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t7 = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %t6, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t8 = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %t7, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %t8, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) %out = bitcast i32 %t9 to float ret float %out } @@ -119,18 +119,18 @@ ;CHECK: buffer_atomic_or_x2 v[0:1], v2, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_xor_x2 v[0:1], v2, s[0:3], 0 idxen glc -define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +define amdgpu_ps float @test3(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex) { main_body: %t0 = sext i32 %data to i64 - %t1 = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 %t0, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t2 = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t3 = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t4 = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t5 = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t6 = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t7 = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t8 = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t9 = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t1 = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 %t0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t2 = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 %t1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t3 = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 %t2, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t4 = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 %t3, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t5 = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 %t4, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t6 = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 %t5, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t7 = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 %t6, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t8 = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 %t7, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %t9 = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 %t8, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) %t10 = trunc i64 %t9 to i32 %out = bitcast i32 %t10 to float ret float %out @@ -155,22 +155,22 @@ ;CHECK-DAG: s_waitcnt vmcnt(0) ;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc ;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc -define amdgpu_ps float @test4(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { +define amdgpu_ps float @test4(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: - %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) - %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) + %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) + %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i1 0) + %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i1 0) + %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i1 0) %ofs.5 = add i32 %voffset, 44 - %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) - %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0) + %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 %ofs.5, i1 0) + %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 8192, i1 0) ; Detecting the no-return variant doesn't work right now because of how the ; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. ; Since there probably isn't a reasonable use-case of cmpswap that discards ; the return value, that seems okay. ; -; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) +; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0) %out = bitcast i32 %o6 to float ret float %out } @@ -179,31 +179,31 @@ ;CHECK: buffer_atomic_add v0, define amdgpu_ps float @test7() { main_body: - %v = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false) + %v = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 1, ptr addrspace(8) undef, i32 0, i32 4, i1 false) %v.float = bitcast i32 %v to float ret float %v.float } -declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, ptr addrspace(8), i32, i32, i1) #0 +declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, ptr addrspace(8), i32, i32, i1) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll @@ -5,9 +5,9 @@ ;SI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 ;GCNX3: buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(<4 x i32> inreg) { +define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(ptr addrspace(8) %0, i32 0, i32 42, i1 0, i1 0) ret <3 x float> %data } @@ -15,9 +15,9 @@ ;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(<4 x i32> inreg) { +define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %0, i32 0, i32 40, i1 0, i1 0) ret <3 x float> %data } @@ -25,9 +25,9 @@ ;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <3 x float> @buffer_raw_load_immoffs_x3(<4 x i32> inreg) { +define amdgpu_ps <3 x float> @buffer_raw_load_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %0, i32 40, i32 0, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %0, i32 40, i32 0, i32 0) ret <3 x float> %data } @@ -35,9 +35,9 @@ ;SI: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 ;GCNX3: buffer_load_format_xyz v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 ;CHECK: s_waitcnt -define amdgpu_ps <3 x float> @buffer_struct_load_format_immoffs_x3(<4 x i32> inreg) { +define amdgpu_ps <3 x float> @buffer_struct_load_format_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret <3 x float> %data } @@ -45,16 +45,16 @@ ;SI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 ;GCNX3: buffer_load_dwordx3 v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <3 x float> @struct_buffer_load_immoffs_x3(<4 x i32> inreg) { +define amdgpu_ps <3 x float> @struct_buffer_load_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %0, i32 0, i32 40, i32 0, i32 0) ret <3 x float> %data } -declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #0 -declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #0 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #0 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32) #0 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32) #0 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}buffer_load_format_d16_x: ; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 -define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %data = call half @llvm.amdgcn.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0, i1 0) ret half %data } @@ -16,9 +16,9 @@ ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] -define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0, i1 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } @@ -29,9 +29,9 @@ ; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %data = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0, i1 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } @@ -42,14 +42,14 @@ ; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] -define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0, i1 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1) -declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1) -declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32>, i32, i32, i1, i1) -declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1) +declare half @llvm.amdgcn.buffer.load.format.f16(ptr addrspace(8), i32, i32, i1, i1) +declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(ptr addrspace(8), i32, i32, i1, i1) +declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(ptr addrspace(8), i32, i32, i1, i1) +declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(ptr addrspace(8), i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -6,11 +6,11 @@ ;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 0) + %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i1 1, i1 0) + %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 1) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 @@ -20,9 +20,9 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 42, i1 0, i1 0) ret <4 x float> %data } @@ -37,11 +37,11 @@ ;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc ;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { main_body: - %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0) - %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36856, i1 0, i1 0) - %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0) + %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4152, i1 0, i1 0) + %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 36856, i1 0, i1 0) + %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 36864, i1 0, i1 0) %d.3 = fadd <4 x float> %d.0, %d.1 %data = fadd <4 x float> %d.2, %d.3 ret <4 x float> %data @@ -53,10 +53,10 @@ ;VI-NOT: s_mov ;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84 ;VI: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(ptr addrspace(8) inreg) { main_body: - %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) - %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) + %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4160, i1 0, i1 0) + %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4176, i1 0, i1 0) %data = fadd <4 x float> %d.0, %d.1 ret <4 x float> %data } @@ -64,37 +64,37 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { main_body: %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data } @@ -102,32 +102,32 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_x: ;CHECK: buffer_load_format_x v0, off, s[0:3], 0 ;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %data = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0, i1 0) ret float %data } ;CHECK-LABEL: {{^}}buffer_load_xy: ;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0 ;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0, i1 0) ret <2 x float> %data } -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i1, i1) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -6,11 +6,11 @@ ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 0) + %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i1 1, i1 0) + %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 1) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 @@ -20,9 +20,9 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 40, i1 0, i1 0) ret <4 x float> %data } @@ -31,46 +31,46 @@ ;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 8192, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { main_body: %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data } @@ -78,37 +78,37 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) + %data = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret float %data } ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret <2 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_negative_offset: ;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen -define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { main_body: %ofs.1 = add i32 %ofs, -16 - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs.1, i1 0, i1 0) ret <4 x float> %data } @@ -117,10 +117,10 @@ ; CHECK-LABEL: buffer_load_mmo: ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) { +define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) { entry: store float 0.0, ptr addrspace(3) %lds - %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %val = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i1 0, i1 0) %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4 store float 0.0, ptr addrspace(3) %tmp2 ret float %val @@ -131,7 +131,7 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged(ptr addrspace(8) inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -139,12 +139,12 @@ %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) - %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0) - %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0) - %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0) + %r1 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a3, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a4, i1 0, i1 0) + %r5 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a5, i1 0, i1 0) + %r6 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a6, i1 0, i1 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void @@ -156,7 +156,7 @@ ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -164,12 +164,12 @@ %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0) - %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0) - %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1) - %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1) + %r1 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a3, i1 1, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a4, i1 1, i1 0) + %r5 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a5, i1 1, i1 1) + %r6 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a6, i1 1, i1 1) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void @@ -179,12 +179,12 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x2_offen_merged(ptr addrspace(8) inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) %r1 = extractelement <2 x float> %vr1, i32 0 %r2 = extractelement <2 x float> %vr1, i32 1 %r3 = extractelement <2 x float> %vr2, i32 0 @@ -197,12 +197,12 @@ ;CHECK-NEXT: %bb. ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x3_offen_merged(ptr addrspace(8) inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) %r1 = extractelement <2 x float> %vr1, i32 0 %r2 = extractelement <2 x float> %vr1, i32 1 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) @@ -214,14 +214,14 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) { main_body: - %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0) - %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0) + %r1 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 4, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 12, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 16, i1 0, i1 0) + %r5 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 28, i1 0, i1 0) + %r6 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 32, i1 0, i1 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void @@ -231,10 +231,10 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x2_offset_merged(ptr addrspace(8) inreg %rsrc) { main_body: - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 4, i1 0, i1 0) + %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 12, i1 0, i1 0) %r1 = extractelement <2 x float> %vr1, i32 0 %r2 = extractelement <2 x float> %vr1, i32 1 %r3 = extractelement <2 x float> %vr2, i32 0 @@ -247,10 +247,10 @@ ;CHECK-NEXT: %bb. ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x3_offset_merged(ptr addrspace(8) inreg %rsrc) { main_body: - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 4, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 12, i1 0, i1 0) %r1 = extractelement <2 x float> %vr1, i32 0 %r2 = extractelement <2 x float> %vr1, i32 1 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) @@ -263,9 +263,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ubyte(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) %val = uitofp i8 %tmp to float ret float %val } @@ -276,9 +276,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ushort(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 16, i1 0, i1 0) %tmp2 = zext i16 %tmp to i32 %val = uitofp i32 %tmp2 to float ret float %val @@ -290,9 +290,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sbyte(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i8 %tmp to i32 %val = sitofp i32 %tmp2 to float ret float %val @@ -304,9 +304,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sshort(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 16, i1 0, i1 0) %tmp2 = sext i16 %tmp to i32 %val = sitofp i32 %tmp2 to float ret float %val @@ -317,9 +317,9 @@ ;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ubyte_bitcast(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i8 %tmp to i32 %val = bitcast i32 %tmp2 to float ret float %val @@ -330,9 +330,9 @@ ;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ushort_bitcast(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i16 %tmp to i32 %val = bitcast i32 %tmp2 to float ret float %val @@ -343,9 +343,9 @@ ;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sbyte_bitcast(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i8 %tmp to i32 %val = bitcast i32 %tmp2 to float ret float %val @@ -356,9 +356,9 @@ ;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sshort_bitcast(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i16 %tmp to i32 %val = bitcast i32 %tmp2 to float ret float %val @@ -370,9 +370,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { +define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(ptr addrspace(8) inreg %rsrc, i32 %idx) { main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 8, i1 0, i1 0) %tmp2 = zext i8 %tmp to i32 %tmp3 = mul i32 %tmp2, 255 %val = bitcast i32 %tmp3 to float @@ -385,9 +385,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { +define amdgpu_ps float @buffer_load_ushort_mul_bitcast(ptr addrspace(8) inreg %rsrc, i32 %idx) { main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 8, i1 0, i1 0) %tmp2 = zext i16 %tmp to i32 %tmp3 = mul i32 %tmp2, 255 %val = bitcast i32 %tmp3 to float @@ -400,9 +400,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { +define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(ptr addrspace(8) inreg %rsrc, i32 %idx) { main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 8, i1 0, i1 0) %tmp2 = sext i8 %tmp to i32 %tmp3 = mul i32 %tmp2, 255 %val = bitcast i32 %tmp3 to float @@ -415,9 +415,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { +define amdgpu_ps float @buffer_load_sshort_mul_bitcast(ptr addrspace(8) inreg %rsrc, i32 %idx) { main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 8, i1 0, i1 0) %tmp2 = sext i16 %tmp to i32 %tmp3 = mul i32 %tmp2, 255 %val = bitcast i32 %tmp3 to float @@ -430,9 +430,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sbyte_type_check(ptr addrspace(8) inreg %rsrc) { main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i8 %tmp to i32 %tmp3 = shl i32 %tmp2, 27 %tmp4 = ashr i32 %tmp3, 27 @@ -446,11 +446,11 @@ ; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset: ; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen -define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @no_fold_fi_imm_soffset(ptr addrspace(8) inreg %rsrc) { %alloca = alloca i32, addrspace(5) %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32 - %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false) + %ret.val = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false) ret float %ret.val } @@ -458,19 +458,19 @@ ; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}} ; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s ; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]] -define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @no_fold_fi_reg_soffset(ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { %alloca = alloca i32, addrspace(5) %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32 - %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false) + %ret.val = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false) ret float %ret.val } -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) #0 -declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8), i32, i32, i1, i1) #0 +declare i8 @llvm.amdgcn.buffer.load.i8(ptr addrspace(8), i32, i32, i1, i1) #0 +declare i16 @llvm.amdgcn.buffer.load.i16(ptr addrspace(8), i32, i32, i1, i1) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll @@ -3,51 +3,51 @@ ;CHECK-LABEL: {{^}}buffer_store_format_immoffs_x3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @buffer_store_format_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: - call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs_x3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @buffer_store_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: - call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}raw_buffer_store_format_immoffs_x3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42 -define amdgpu_ps void @raw_buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @raw_buffer_store_format_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}raw_buffer_store_immoffs_x3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42 -define amdgpu_ps void @raw_buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @raw_buffer_store_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}struct_buffer_store_immoffs_x3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 -define amdgpu_ps void @struct_buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @struct_buffer_store_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float>, ptr addrspace(8), i32, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -6,9 +6,9 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { main_body: - call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.f16(half %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } @@ -22,15 +22,15 @@ ; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %index) { main_body: - call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } -define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <3 x half> %data, i32 %index) { main_body: - call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } @@ -51,13 +51,13 @@ ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1) -declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1) -declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i1, i1) -declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.f16(half, ptr addrspace(8), i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -6,47 +6,47 @@ ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 1) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i1 0, i1 0) ret void } @@ -54,9 +54,9 @@ ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i1 0, i1 0) ret void } @@ -69,36 +69,36 @@ ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, ptr addrspace(8) %0, i32 %4, i32 0, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { main_body: - call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) { main_body: - call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.format.f32(float, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -6,47 +6,47 @@ ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i1 0, i1 1) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i1 0, i1 0) ret void } @@ -54,9 +54,9 @@ ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i1 0, i1 0) ret void } @@ -69,29 +69,29 @@ ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %0, i32 %4, i32 0, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { main_body: - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) #0 { main_body: - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } @@ -99,19 +99,19 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 0, i32 %a3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 0, i32 %a4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 0, i32 %a5, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 0, i32 %a6, i1 0, i1 0) ret void } @@ -120,65 +120,65 @@ ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1) - call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1) + call void @llvm.amdgcn.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 0, i32 %a3, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 0, i32 %a4, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 0, i32 %a5, i1 1, i1 1) + call void @llvm.amdgcn.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 0, i32 %a6, i1 1, i1 1) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offen_merged(ptr addrspace(8) inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) { +define amdgpu_ps void @buffer_store_x3_offen_merged(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) { %a1 = add i32 %a, 28 %a2 = add i32 %a, 32 %a3 = add i32 %a, 36 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 0, i32 %a3, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) { +define amdgpu_ps void @buffer_store_x3_offen_merged2(ptr addrspace(8) inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x3_offen_merged3(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 0, i32 %a2, i1 0, i1 0) ret void } @@ -186,50 +186,50 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0) +define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 0, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 0, i32 16, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 0, i32 28, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 0, i32 32, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) { - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) +define amdgpu_ps void @buffer_store_x2_offset_merged(ptr addrspace(8) inreg %rsrc, <2 x float> %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 0, i32 12, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) { - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) +define amdgpu_ps void @buffer_store_x3_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 0, i32 12, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) { - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) +define amdgpu_ps void @buffer_store_x3_offset_merged2(ptr addrspace(8) inreg %rsrc, float %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8 -define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) { - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) +define amdgpu_ps void @buffer_store_x3_offset_merged3(ptr addrspace(8) inreg %rsrc, <2 x float> %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 0, i32 16, i1 0, i1 0) ret void } @@ -237,11 +237,11 @@ ;CHECK-NOT: s_waitcnt ;CHECK-NEXT: %bb. ;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8 -define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1) { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 - call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.i8(i8 %v3, ptr addrspace(8) %rsrc, i32 0, i32 8, i1 0, i1 0) ret void } @@ -249,20 +249,20 @@ ;CHECK-NOT: s_waitcnt ;CHECK-NEXT: %bb. ;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16 -define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @buffer_store_short(ptr addrspace(8) inreg %rsrc, float %v1) { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 - call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.i16(i16 %v3, ptr addrspace(8) %rsrc, i32 0, i32 16, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.f32(float, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.i8(i8, ptr addrspace(8), i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.i16(i16, ptr addrspace(8), i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll @@ -4,7 +4,7 @@ declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #2 +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #2 ; GCN-LABEL: {{^}}test_export_zeroes_f32: ; GCN: exp mrt0 off, off, off, off{{$}} @@ -614,7 +614,7 @@ define amdgpu_kernel void @test_export_pos_before_param_across_load(i32 %idx) #0 { call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false) - %load = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0) + %load = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0) call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %load, i1 true, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll @@ -18,7 +18,7 @@ ; GFX11: buffer_store_b32 ; GFX11: buffer_store_b32 ; GFX11: buffer_store_b32 -define amdgpu_ps void @lds_direct_load(<4 x i32> inreg %buf, i32 inreg %arg0, +define amdgpu_ps void @lds_direct_load(ptr addrspace(8) inreg %buf, i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2) #0 { main_body: %p0 = call float @llvm.amdgcn.lds.direct.load(i32 %arg0) @@ -28,17 +28,17 @@ %p2 = call float @llvm.amdgcn.lds.direct.load(i32 %arg2) %p3 = call float @llvm.amdgcn.lds.direct.load(i32 %arg1) %p4 = call float @llvm.amdgcn.lds.direct.load(i32 %arg2) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, <4 x i32> %buf, i32 4, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, <4 x i32> %buf, i32 4, i32 1, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, <4 x i32> %buf, i32 4, i32 2, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, <4 x i32> %buf, i32 4, i32 3, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, <4 x i32> %buf, i32 4, i32 4, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, <4 x i32> %buf, i32 4, i32 5, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, ptr addrspace(8) %buf, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, ptr addrspace(8) %buf, i32 4, i32 1, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, ptr addrspace(8) %buf, i32 4, i32 2, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, ptr addrspace(8) %buf, i32 4, i32 3, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, ptr addrspace(8) %buf, i32 4, i32 4, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, ptr addrspace(8) %buf, i32 4, i32 5, i32 0) ret void } declare float @llvm.amdgcn.lds.direct.load(i32) #1 -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32) attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll @@ -20,7 +20,7 @@ ; GFX11: s_waitcnt expcnt(0) ; GFX11: buffer_store_b32 ; GFX11: buffer_store_b32 -define amdgpu_ps void @lds_param_load(<4 x i32> inreg %buf, i32 inreg %arg) #0 { +define amdgpu_ps void @lds_param_load(ptr addrspace(8) inreg %buf, i32 inreg %arg) #0 { main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %arg) ; Ensure memory clustering is occuring for lds_param_load @@ -29,17 +29,17 @@ %p2 = call float @llvm.amdgcn.lds.param.load(i32 2, i32 0, i32 %arg) %p3 = call float @llvm.amdgcn.lds.param.load(i32 3, i32 0, i32 %arg) %p4 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %arg) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, <4 x i32> %buf, i32 4, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, <4 x i32> %buf, i32 4, i32 1, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, <4 x i32> %buf, i32 4, i32 2, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, <4 x i32> %buf, i32 4, i32 3, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, <4 x i32> %buf, i32 4, i32 4, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, <4 x i32> %buf, i32 4, i32 5, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, ptr addrspace(8) %buf, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, ptr addrspace(8) %buf, i32 4, i32 1, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, ptr addrspace(8) %buf, i32 4, i32 2, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, ptr addrspace(8) %buf, i32 4, i32 3, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, ptr addrspace(8) %buf, i32 4, i32 4, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, ptr addrspace(8) %buf, i32 4, i32 5, i32 0) ret void } declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1 -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32) attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -10,11 +10,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen ; CHECK-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -23,11 +23,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6 ; CHECK-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -36,11 +36,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s6 offen ; CHECK-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -49,11 +49,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92 ; CHECK-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -62,11 +62,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc ; CHECK-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } -declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll @@ -14,16 +14,16 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc -define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) { +define amdgpu_ps float @test1(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) { main_body: - %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o1, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) %off5 = add i32 %voffset, 42 - %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %off5, i32 0, i32 0) - %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 4, i32 8188, i32 0) - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o3, ptr addrspace(8) %rsrc, i32 %off5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o5, ptr addrspace(8) %rsrc, i32 4, i32 8188, i32 0) + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o6, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %o7 = bitcast i32 %o6 to float - %out = call float @llvm.amdgcn.raw.buffer.atomic.swap.f32(float %o7, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.raw.buffer.atomic.swap.f32(float %o7, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret float %out } @@ -50,19 +50,19 @@ ;CHECK: buffer_atomic_inc v0, v1, s[0:3], 0 offen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_dec v0, v1, s[0:3], 0 offen glc -define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) { +define amdgpu_ps float @test2(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) { main_body: - %t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) - %t2 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) - %t3 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) - %t4 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) - %t5 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) - %t6 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) - %t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) - %t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) - %t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) - %t10 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 %t9, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) - %t11 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 %t10, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) + %t2 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 %t1, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 2) + %t3 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %t2, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) + %t4 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %t3, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 2) + %t5 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %t4, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) + %t6 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 %t5, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 2) + %t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 %t6, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) + %t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 %t7, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 2) + %t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 %t8, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) + %t10 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 %t9, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) + %t11 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 %t10, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) %out = bitcast i32 %t11 to float ret float %out } @@ -81,20 +81,20 @@ ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 offen offset:44 glc ;CHECK-DAG: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc -define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { +define amdgpu_ps float @test3(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: - %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o1, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) %ofs.5 = add i32 %voffset, 44 - %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %ofs.5, i32 0, i32 0) - %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 4, i32 8188, i32 0) + %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o3, i32 %cmp, ptr addrspace(8) %rsrc, i32 %ofs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o5, i32 %cmp, ptr addrspace(8) %rsrc, i32 4, i32 8188, i32 0) ; Detecting the no-return variant doesn't work right now because of how the ; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. ; Since there probably isn't a reasonable use-case of cmpswap that discards ; the return value, that seems okay. ; -; %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0) +; %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o6, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %out = bitcast i32 %o6 to float ret float %out } @@ -103,24 +103,24 @@ ;CHECK: buffer_atomic_add v0, define amdgpu_ps float @test4() { main_body: - %v = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 4, i32 0, i32 0) + %v = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, ptr addrspace(8) undef, i32 4, i32 0, i32 0) %v.float = bitcast i32 %v to float ret float %v.float } -declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare float @llvm.amdgcn.raw.buffer.atomic.swap.f32(float, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare float @llvm.amdgcn.raw.buffer.atomic.swap.f32(float, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}buffer_load_format_d16_x: ; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 -define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %data = call half @llvm.amdgcn.raw.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret half %data } @@ -16,9 +16,9 @@ ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] -define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } @@ -28,9 +28,9 @@ ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] ; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } @@ -41,14 +41,14 @@ ; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] -define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32) -declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32) -declare <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32) -declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32) +declare half @llvm.amdgcn.raw.buffer.load.format.f16(ptr addrspace(8), i32, i32, i32) +declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(ptr addrspace(8), i32, i32, i32) +declare <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(ptr addrspace(8), i32, i32, i32) +declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(ptr addrspace(8), i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll @@ -6,11 +6,11 @@ ;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) - %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1) - %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 2) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 @@ -20,9 +20,9 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 42, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret <4 x float> %data } @@ -33,11 +33,11 @@ ;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { main_body: - %d.0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 60, i32 0) - %d.1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 32764, i32 0) - %d.2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4, i32 36860, i32 0) + %d.0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 4092, i32 60, i32 0) + %d.1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 4092, i32 32764, i32 0) + %d.2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 4, i32 36860, i32 0) %d.3 = fadd <4 x float> %d.0, %d.1 %data = fadd <4 x float> %d.2, %d.3 ret <4 x float> %data @@ -46,42 +46,42 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { main_body: %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %ofs, i32 0, i32 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_x: ;CHECK: buffer_load_format_x v0, off, s[0:3], 0 ;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret float %data } ;CHECK-LABEL: {{^}}buffer_load_xy: ;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0 ;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret <2 x float> %data } -declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #0 -declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0 +declare float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN -declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) +declare void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) -define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { +define amdgpu_ps float @buffer_load_lds_dword(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { ; GCN-LABEL: buffer_load_lds_dword: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -18,14 +18,14 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1) - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2) %res = load float, ptr addrspace(3) %lds ret float %res } -define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { +define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { ; GCN-LABEL: buffer_load_lds_dword_imm_voffset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_mov_b32_e32 v0, 0x800 @@ -34,11 +34,11 @@ ; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 2048, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 2048, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) { +define amdgpu_ps void @buffer_load_lds_dword_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) { ; GCN-LABEL: buffer_load_lds_dword_v_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -46,11 +46,11 @@ ; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) { +define amdgpu_ps void @buffer_load_lds_dword_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) { ; GCN-LABEL: buffer_load_lds_dword_s_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -58,11 +58,11 @@ ; GCN-NEXT: buffer_load_dword off, s[0:3], s5 lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_load_lds_dword_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { ; GCN-LABEL: buffer_load_lds_dword_vs_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -70,11 +70,11 @@ ; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { ; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -82,11 +82,11 @@ ; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen offset:2048 lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { +define amdgpu_ps void @buffer_load_lds_ushort(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { ; GCN-LABEL: buffer_load_lds_ushort: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_mov_b32_e32 v0, 0x800 @@ -95,11 +95,11 @@ ; GCN-NEXT: buffer_load_ushort v0, s[0:3], 0 offen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 2, i32 2048, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 2, i32 2048, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { +define amdgpu_ps void @buffer_load_lds_ubyte(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { ; GCN-LABEL: buffer_load_lds_ubyte: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -107,6 +107,6 @@ ; GCN-NEXT: buffer_load_ubyte off, s[0:3], 0 offset:2048 lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0) + call void @llvm.amdgcn.raw.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10 ;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11 -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 @@ -31,16 +31,16 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) - %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1) - %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 2) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_dlc: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 @@ -67,16 +67,16 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4) - %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 5) - %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 6) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 4) + %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 5) + %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 6) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_immoffs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 @@ -95,11 +95,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 40, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_immoffs_large: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: s_movk_i32 s4, 0x1ffc @@ -121,11 +121,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 4, i32 8188, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { ; PREGFX10-LABEL: buffer_load_ofs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen @@ -144,11 +144,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { ; PREGFX10-LABEL: buffer_load_ofs_imm: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 @@ -168,11 +168,11 @@ ; GFX11-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 %ofs, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_voffset_large_12bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4092 @@ -191,11 +191,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4092, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 4092, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_voffset_large_13bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0x1000 @@ -217,11 +217,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8188, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 8188, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_voffset_large_16bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0xf000 @@ -243,11 +243,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 65532, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 65532, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_voffset_large_23bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0x7ff000 @@ -269,11 +269,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8388604, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 8388604, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_voffset_large_24bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0xfff000 @@ -295,12 +295,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 16777212, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 16777212, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) { +define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %ofs) { ; PREGFX10-LABEL: buffer_load_x1: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen @@ -319,11 +319,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) + %data = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0, i32 0) ret float %data } -define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) { +define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %ofs) { ; PREGFX10-LABEL: buffer_load_x2: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen @@ -342,11 +342,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { ; GFX10-LABEL: buffer_load_negative_offset: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_add_nc_u32_e32 v0, -16, v0 @@ -362,11 +362,11 @@ ; GFX11-NEXT: ; return to shader part epilog main_body: %ofs.1 = add i32 %ofs, -16 - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs.1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 %ofs.1, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) { +define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) { ; GFX10-LABEL: buffer_load_mmo: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 @@ -388,13 +388,13 @@ ; GFX11-NEXT: ; return to shader part epilog entry: store float 0.0, ptr addrspace(3) %lds - %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4 store float 0.0, ptr addrspace(3) %tmp2 ret float %val } -define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a) { ; PREGFX10-LABEL: buffer_load_x1_offen_merged_and: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 @@ -433,18 +433,18 @@ %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) - %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 0) - %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 0) - %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 0) - %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 0) + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { +define amdgpu_ps void @buffer_load_x1_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp) { ; PREGFX10-LABEL: buffer_load_x1_offen_merged_or: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_lshlrev_b32_e32 v4, 6, v0 @@ -487,18 +487,18 @@ %a4 = or i32 %a, 16 %a5 = or i32 %a, 28 %a6 = or i32 %a, 32 - %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) - %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 0) - %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 0) - %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 0) - %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 0) + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a) { ; PREGFX10-LABEL: buffer_load_x1_offen_merged_glc_slc: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 @@ -540,18 +540,18 @@ %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) - %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 1) - %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 1) - %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 3) - %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 3) + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 1) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 1) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 3) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 3) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x2_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a) { ; PREGFX10-LABEL: buffer_load_x2_offen_merged_and: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4 @@ -575,8 +575,8 @@ main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) %r1 = extractelement <2 x float> %vr1, i32 0 %r2 = extractelement <2 x float> %vr1, i32 1 %r3 = extractelement <2 x float> %vr2, i32 0 @@ -585,7 +585,7 @@ ret void } -define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { +define amdgpu_ps void @buffer_load_x2_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp) { ; PREGFX10-LABEL: buffer_load_x2_offen_merged_or: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -613,8 +613,8 @@ %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) %r1 = extractelement <2 x float> %vr1, i32 0 %r2 = extractelement <2 x float> %vr1, i32 1 %r3 = extractelement <2 x float> %vr2, i32 0 @@ -623,7 +623,7 @@ ret void } -define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: buffer_load_x1_offset_merged: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4 @@ -656,18 +656,18 @@ ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) - %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) - %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) - %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0) - %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0) - %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0) + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 16, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 28, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 32, i32 0, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x2_offset_merged(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: buffer_load_x2_offset_merged: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4 @@ -689,8 +689,8 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm main_body: - %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) - %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) + %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) %r1 = extractelement <2 x float> %vr1, i32 0 %r2 = extractelement <2 x float> %vr1, i32 1 %r3 = extractelement <2 x float> %vr2, i32 0 @@ -699,7 +699,7 @@ ret void } -define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_int: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 @@ -726,9 +726,9 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0) - %data_glc = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 1) - %data_slc = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %0, i32 0, i32 0, i32 2) + %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0) + %data_glc = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 1) + %data_slc = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %0, i32 0, i32 0, i32 2) %fdata = bitcast <4 x i32> %data to <4 x float> %fdata_glc = bitcast <2 x i32> %data_glc to <2 x float> %fdata_slc = bitcast i32 %data_slc to float @@ -738,7 +738,7 @@ ret {<4 x float>, <2 x float>, float} %r2 } -define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: raw_buffer_load_ubyte: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 @@ -760,13 +760,13 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: ; return to shader part epilog main_body: - %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %tmp2 = zext i8 %tmp to i32 %val = uitofp i32 %tmp2 to float ret float %val } -define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_i16(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: raw_buffer_load_i16: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_ushort v0, off, s[0:3], 0 @@ -788,13 +788,13 @@ ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: ; return to shader part epilog main_body: - %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %tmp2 = zext i16 %tmp to i32 %val = uitofp i32 %tmp2 to float ret float %val } -define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: raw_buffer_load_sbyte: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 @@ -816,13 +816,13 @@ ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: ; return to shader part epilog main_body: - %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %tmp2 = sext i8 %tmp to i32 %val = sitofp i32 %tmp2 to float ret float %val } -define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_sshort(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: raw_buffer_load_sshort: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_sshort v0, off, s[0:3], 0 @@ -844,13 +844,13 @@ ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: ; return to shader part epilog main_body: - %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %tmp2 = sext i16 %tmp to i32 %val = sitofp i32 %tmp2 to float ret float %val } -define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) { +define amdgpu_ps void @raw_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_buffer_load_f16: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 @@ -873,12 +873,12 @@ ; GFX11-NEXT: ds_store_b16 v0, v1 ; GFX11-NEXT: s_endpgm main_body: - %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) store half %val, ptr addrspace(3) %ptr ret void } -define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) { +define amdgpu_ps void @raw_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_buffer_load_v2f16: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 @@ -901,12 +901,12 @@ ; GFX11-NEXT: ds_store_b32 v0, v1 ; GFX11-NEXT: s_endpgm main_body: - %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) store <2 x half> %val, ptr addrspace(3) %ptr ret void } -define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) { +define amdgpu_ps void @raw_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_buffer_load_v4f16: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 @@ -929,12 +929,12 @@ ; GFX11-NEXT: ds_store_b64 v0, v[1:2] ; GFX11-NEXT: s_endpgm main_body: - %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) store <4 x half> %val, ptr addrspace(3) %ptr ret void } -define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) { +define amdgpu_ps void @raw_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_buffer_load_v2i16: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 @@ -957,12 +957,12 @@ ; GFX11-NEXT: ds_store_b32 v0, v1 ; GFX11-NEXT: s_endpgm main_body: - %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) store <2 x i16> %val, ptr addrspace(3) %ptr ret void } -define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) { +define amdgpu_ps void @raw_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { ; PREGFX10-LABEL: raw_buffer_load_v4i16: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 @@ -985,12 +985,12 @@ ; GFX11-NEXT: ds_store_b64 v0, v[1:2] ; GFX11-NEXT: s_endpgm main_body: - %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) store <4 x i16> %val, ptr addrspace(3) %ptr ret void } -define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @raw_buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: raw_buffer_load_x1_offset_merged: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4 @@ -1023,18 +1023,18 @@ ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) - %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) - %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) - %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0) - %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0) - %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0) + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 16, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 28, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 32, i32 0, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 @@ -1079,29 +1079,29 @@ ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8) - %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8) - %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 8) - %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 8) - %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 8) - %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 8) + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 8) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 8, i32 0, i32 8) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 8) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 16, i32 0, i32 8) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 28, i32 0, i32 8) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 32, i32 0, i32 8) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) ret void } -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 -declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0 -declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0 -declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0 +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32) #0 +declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(ptr addrspace(8), i32, i32, i32) #0 +declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0 -declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0 -declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32>, i32, i32, i32) #0 -declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32>, i32, i32, i32) #0 -declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #0 -declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #0 -declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #0 +declare i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8), i32, i32, i32) #0 +declare i16 @llvm.amdgcn.raw.buffer.load.i16(ptr addrspace(8), i32, i32, i32) #0 +declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32) #0 +declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32) #0 +declare half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8), i32, i32, i32) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32) #0 +declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -6,9 +6,9 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen -define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %voffset) { +define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %voffset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f16(half %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) ret void } @@ -22,9 +22,9 @@ ; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen -define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %voffset) { +define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %voffset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) ret void } @@ -45,10 +45,10 @@ ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] ; PACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen -define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) { +define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %voffset) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> - call void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) ret void } @@ -69,13 +69,13 @@ ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen -define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) { +define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %voffset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll @@ -6,29 +6,29 @@ ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 2) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) ret void } @@ -41,36 +41,36 @@ ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 offen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %data, ptr addrspace(8) %0, i32 %4, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %offset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.f32(float %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %offset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #1 +declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -6,29 +6,29 @@ ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 2) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) ret void } @@ -41,29 +41,29 @@ ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %0, i32 %4, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %offset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %offset) #0 { main_body: - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } @@ -71,19 +71,19 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 0) ret void } @@ -91,7 +91,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -99,12 +99,12 @@ %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 0) ret void } @@ -114,42 +114,42 @@ ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 1) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 1) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 3) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 3) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 3) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 3) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0) ret void } @@ -157,22 +157,22 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { - call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0) +define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 16, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 28, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 32, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) +define amdgpu_ps void @buffer_store_x2_offset_merged(ptr addrspace(8) inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) ret void } @@ -181,11 +181,11 @@ ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc ;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1) - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 2) + call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %3, ptr addrspace(8) %0, i32 0, i32 0, i32 2) ret void } @@ -194,11 +194,11 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @raw_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1) { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 - call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret void } @@ -207,11 +207,11 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @raw_buffer_store_short(ptr addrspace(8) inreg %rsrc, float %v1) { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 - call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret void } @@ -220,29 +220,29 @@ ;CHECK-NOT: v0 ;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) { +define amdgpu_ps void @raw_buffer_store_f16(ptr addrspace(8) inreg %rsrc, i32 %v1) { main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half - call void @llvm.amdgcn.raw.buffer.store.f16(half %cast, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f16(half %cast, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_v2f16: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) { +define amdgpu_ps void @buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %offset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_v4f16: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 { +define amdgpu_ps void @buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %data, i32 %offset) #0 { main_body: - call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } @@ -251,28 +251,28 @@ ;CHECK-NOT: v0 ;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) { +define amdgpu_ps void @raw_buffer_store_i16(ptr addrspace(8) inreg %rsrc, i32 %v1) { main_body: %trunc = trunc i32 %v1 to i16 - call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_v2i16: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) { +define amdgpu_ps void @buffer_store_v2i16(ptr addrspace(8) inreg %rsrc, <2 x i16> %data, i32 %offset) { main_body: - call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_v4i16: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { +define amdgpu_ps void @buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { main_body: - call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } @@ -280,13 +280,13 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { - call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0) +define amdgpu_ps void @raw_buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 16, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 28, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 32, i32 0, i32 0) ret void } @@ -297,30 +297,30 @@ ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 -define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { - call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 8) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 8) - call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 8) +define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, ptr addrspace(8) %rsrc, i32 16, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, ptr addrspace(8) %rsrc, i32 28, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, ptr addrspace(8) %rsrc, i32 32, i32 0, i32 8) ret void } -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1 -declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #1 +declare void @llvm.amdgcn.raw.buffer.store.i8(i8, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.f16(half, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.i16(i16, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16>, ptr addrspace(8), i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16>, ptr addrspace(8), i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s -define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_x(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: tbuffer_load_format_d16_x v0, off, s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x00,0xb4,0xe8,0x00,0x00,0x00,0x80] @@ -30,11 +30,11 @@ ; GFX11-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 22, i32 0) ret half %data } -define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xy(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: tbuffer_load_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x80,0xb4,0xe8,0x00,0x00,0x00,0x80] @@ -63,12 +63,12 @@ ; GFX11-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 22, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xyz(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: tbuffer_load_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x00,0xb5,0xe8,0x00,0x00,0x00,0x80] @@ -97,12 +97,12 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 22, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xyzw(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: tbuffer_load_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x80,0xb5,0xe8,0x00,0x00,0x00,0x80] @@ -131,12 +131,12 @@ ; GFX11-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 22, i32 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) -declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) -declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) -declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) +declare half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8), i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32) +declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX11 %s -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: tbuffer_load: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] @@ -34,10 +34,10 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 78, i32 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 63, i32 1) - %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 2) - %vdata_f32 = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 5) + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 78, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 63, i32 1) + %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 22, i32 2) + %vdata_f32 = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 22, i32 5) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> @@ -48,7 +48,7 @@ ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 } -define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: tbuffer_load_immoffs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 @@ -67,12 +67,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 42, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_voffset_large_12bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] offset:4092 @@ -91,11 +91,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 4092, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 4092, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_13bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0x1000 @@ -117,11 +117,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 8188, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 8188, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_16bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0xf000 @@ -143,11 +143,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 65532, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 65532, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_23bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0x7ff000 @@ -169,11 +169,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 8388604, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 8388604, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_24bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0xfff000 @@ -195,11 +195,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 16777212, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 16777212, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(ptr addrspace(8) inreg, i32 inreg %soffs) { ; PREGFX10-LABEL: tbuffer_load_immoffs_large: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095 @@ -225,9 +225,9 @@ ; GFX11-NEXT: tbuffer_load_format_xyzw v[8:11], off, s[0:3], s4 format:77 offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 73, i32 %soffs, i32 62, i32 0) - %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 1, i32 %soffs, i32 77, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 4095, i32 61, i32 47, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 73, i32 %soffs, i32 62, i32 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 1, i32 %soffs, i32 77, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> @@ -237,7 +237,7 @@ ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_ofs(ptr addrspace(8) inreg, i32 %voffs) { ; PREGFX10-LABEL: tbuffer_load_ofs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen @@ -256,12 +256,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 %voffs, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(ptr addrspace(8) inreg, i32 %voffs) { ; PREGFX10-LABEL: tbuffer_load_ofs_imm: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52 @@ -281,12 +281,12 @@ ; GFX11-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %voffs, 52 - %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %ofs, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 %ofs, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +define amdgpu_vs <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: buffer_load_xy: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: tbuffer_load_format_xy v[0:1], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] @@ -304,12 +304,12 @@ ; GFX11-NEXT: tbuffer_load_format_xy v[0:1], off, s[0:3], 0 format:77 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) + %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> ret <2 x float> %vdata.f } -define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { +define amdgpu_vs float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: buffer_load_x: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: tbuffer_load_format_x v0, off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] @@ -327,12 +327,12 @@ ; GFX11-NEXT: tbuffer_load_format_x v0, off, s[0:3], 0 format:77 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) + %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 %vdata to float ret float %vdata.f } -declare i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32) -declare <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) -declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) -declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) +declare i32 @llvm.amdgcn.raw.tbuffer.load.i32(ptr addrspace(8), i32, i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(ptr addrspace(8), i32, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10-PACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX11-PACKED %s -define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { +define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 @@ -45,11 +45,11 @@ ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { +define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 @@ -92,11 +92,11 @@ ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { +define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -148,11 +148,11 @@ ; GFX11-PACKED-NEXT: s_endpgm main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { +define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -202,11 +202,11 @@ ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) ret void } -declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) -declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) -declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) -declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, ptr addrspace(8), i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s -define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { ; PREGFX10-LABEL: tbuffer_store: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED] @@ -34,14 +34,14 @@ %in1 = bitcast <4 x float> %1 to <4 x i32> %in2 = bitcast <4 x float> %2 to <4 x i32> %in3 = bitcast <4 x float> %3 to <4 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 44, i32 0) - call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 61, i32 1) - call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 2) - call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 5) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 0, i32 44, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in2, ptr addrspace(8) %0, i32 0, i32 0, i32 61, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in3, ptr addrspace(8) %0, i32 0, i32 0, i32 78, i32 2) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 78, i32 5) ret void } -define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @tbuffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { ; PREGFX10-LABEL: tbuffer_store_immoffs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 @@ -59,11 +59,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 0, i32 117, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 42, i32 0, i32 117, i32 0) ret void } -define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(ptr addrspace(8) inreg, <4 x float> %vdata, i32 inreg %soffset) { ; PREGFX10-LABEL: tbuffer_store_scalar_and_imm_offs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 @@ -81,11 +81,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 %soffset, i32 117, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 42, i32 %soffset, i32 117, i32 0) ret void } -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %voffset) { ; PREGFX10-LABEL: buffer_store_ofs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen @@ -103,11 +103,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %voffset, i32 0, i32 115, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 %voffset, i32 0, i32 115, i32 0) ret void } -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data) { ; PREGFX10-LABEL: buffer_store_x1: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_store_format_x v0, off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] @@ -125,11 +125,11 @@ ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast float %data to i32 - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 125, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %data.i, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 125, i32 0) ret void } -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data) { ; PREGFX10-LABEL: buffer_store_x2: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] @@ -147,11 +147,11 @@ ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast <2 x float> %data to <2 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32> %data.i, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_12bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_12bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; PREGFX10-LABEL: buffer_store_voffset_large_12bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] offset:4092 @@ -168,11 +168,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 4092, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 4092, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_13bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_13bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; PREGFX10-LABEL: buffer_store_voffset_large_13bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v4, 0x1000 @@ -192,11 +192,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 8188, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 8188, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_16bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_16bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; PREGFX10-LABEL: buffer_store_voffset_large_16bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v4, 0xf000 @@ -216,11 +216,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 65532, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 65532, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_23bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_23bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; PREGFX10-LABEL: buffer_store_voffset_large_23bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v4, 0x7ff000 @@ -240,11 +240,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 8388604, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 8388604, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_24bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_24bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; PREGFX10-LABEL: buffer_store_voffset_large_24bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v4, 0xfff000 @@ -264,13 +264,13 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 16777212, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 16777212, i32 0, i32 63, i32 0) ret void } -declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX78910,GFX8910,GFX910,GFX10 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11 -define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { +define amdgpu_ps void @s_buffer_load_imm(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_load_imm: ; GFX67: ; %bb.0: ; %main_body ; GFX67-NEXT: s_buffer_load_dword s0, s[0:3], 0x1 @@ -31,13 +31,13 @@ ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 4, i32 0) %bitcast = bitcast i32 %load to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) { +define amdgpu_ps void @s_buffer_load_index(ptr addrspace(8) inreg %desc, i32 inreg %index) { ; GFX678910-LABEL: s_buffer_load_index: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: s_buffer_load_dword s0, s[0:3], s4 @@ -54,13 +54,13 @@ ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast i32 %load to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_ps void @s_buffer_load_index_divergent(ptr addrspace(8) inreg %desc, i32 %index) { ; GFX678910-LABEL: s_buffer_load_index_divergent: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen @@ -75,13 +75,13 @@ ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast i32 %load to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) { +define amdgpu_ps void @s_buffer_loadx2_imm(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_loadx2_imm: ; GFX67: ; %bb.0: ; %main_body ; GFX67-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x10 @@ -109,7 +109,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0) + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %desc, i32 64, i32 0) %bitcast = bitcast <2 x i32> %load to <2 x float> %x = extractelement <2 x float> %bitcast, i32 0 %y = extractelement <2 x float> %bitcast, i32 1 @@ -117,7 +117,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) { +define amdgpu_ps void @s_buffer_loadx2_index(ptr addrspace(8) inreg %desc, i32 inreg %index) { ; GFX678910-LABEL: s_buffer_loadx2_index: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], s4 @@ -136,7 +136,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast <2 x i32> %load to <2 x float> %x = extractelement <2 x float> %bitcast, i32 0 %y = extractelement <2 x float> %bitcast, i32 1 @@ -144,7 +144,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_ps void @s_buffer_loadx2_index_divergent(ptr addrspace(8) inreg %desc, i32 %index) { ; GFX678910-LABEL: s_buffer_loadx2_index_divergent: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen @@ -159,7 +159,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast <2 x i32> %load to <2 x float> %x = extractelement <2 x float> %bitcast, i32 0 %y = extractelement <2 x float> %bitcast, i32 1 @@ -167,7 +167,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) { +define amdgpu_ps void @s_buffer_loadx3_imm(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_loadx3_imm: ; GFX67: ; %bb.0: ; %main_body ; GFX67-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x10 @@ -198,7 +198,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 64, i32 0) + %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) %desc, i32 64, i32 0) %bitcast = bitcast <3 x i32> %load to <3 x float> %x = extractelement <3 x float> %bitcast, i32 0 %y = extractelement <3 x float> %bitcast, i32 1 @@ -207,7 +207,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %index) { +define amdgpu_ps void @s_buffer_loadx3_index(ptr addrspace(8) inreg %desc, i32 inreg %index) { ; GFX678910-LABEL: s_buffer_loadx3_index: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], s4 @@ -228,7 +228,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast <3 x i32> %load to <3 x float> %x = extractelement <3 x float> %bitcast, i32 0 %y = extractelement <3 x float> %bitcast, i32 1 @@ -237,7 +237,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_ps void @s_buffer_loadx3_index_divergent(ptr addrspace(8) inreg %desc, i32 %index) { ; GFX6-LABEL: s_buffer_loadx3_index_divergent: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen @@ -259,7 +259,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast <3 x i32> %load to <3 x float> %x = extractelement <3 x float> %bitcast, i32 0 %y = extractelement <3 x float> %bitcast, i32 1 @@ -268,7 +268,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) { +define amdgpu_ps void @s_buffer_loadx4_imm(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_loadx4_imm: ; GFX67: ; %bb.0: ; %main_body ; GFX67-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x32 @@ -302,7 +302,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0) + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %desc, i32 200, i32 0) %bitcast = bitcast <4 x i32> %load to <4 x float> %x = extractelement <4 x float> %bitcast, i32 0 %y = extractelement <4 x float> %bitcast, i32 1 @@ -312,7 +312,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) { +define amdgpu_ps void @s_buffer_loadx4_index(ptr addrspace(8) inreg %desc, i32 inreg %index) { ; GFX678910-LABEL: s_buffer_loadx4_index: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], s4 @@ -335,7 +335,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast <4 x i32> %load to <4 x float> %x = extractelement <4 x float> %bitcast, i32 0 %y = extractelement <4 x float> %bitcast, i32 1 @@ -345,7 +345,7 @@ ret void } -define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_ps void @s_buffer_loadx4_index_divergent(ptr addrspace(8) inreg %desc, i32 %index) { ; GFX678910-LABEL: s_buffer_loadx4_index_divergent: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen @@ -360,7 +360,7 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm main_body: - %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0) + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %desc, i32 %index, i32 0) %bitcast = bitcast <4 x i32> %load to <4 x float> %x = extractelement <4 x float> %bitcast, i32 0 %y = extractelement <4 x float> %bitcast, i32 1 @@ -370,7 +370,7 @@ ret void } -define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { +define amdgpu_ps void @s_buffer_load_imm_mergex2(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_load_imm_mergex2: ; GFX67: ; %bb.0: ; %main_body ; GFX67-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x1 @@ -398,15 +398,15 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm main_body: - %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) - %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 4, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 8, i32 0) %x = bitcast i32 %load0 to float %y = bitcast i32 %load1 to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { +define amdgpu_ps void @s_buffer_load_imm_mergex4(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_load_imm_mergex4: ; GFX67: ; %bb.0: ; %main_body ; GFX67-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x2 @@ -440,10 +440,10 @@ ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm main_body: - %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) - %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0) - %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0) - %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0) + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 8, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 12, i32 0) + %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 16, i32 0) + %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 20, i32 0) %x = bitcast i32 %load0 to float %y = bitcast i32 %load1 to float %z = bitcast i32 %load2 to float @@ -454,7 +454,7 @@ @gv = external addrspace(1) global i32 -define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_ps void @s_buffer_load_index_across_bb(ptr addrspace(8) inreg %desc, i32 %index) { ; GFX6-LABEL: s_buffer_load_index_across_bb: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: s_getpc_b64 s[4:5] @@ -561,13 +561,13 @@ bb1: ; preds = %main_body %tmp1 = or i32 %tmp, 8 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 %tmp1, i32 0) %bitcast = bitcast i32 %load to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_ps void @s_buffer_load_index_across_bb_merged(ptr addrspace(8) inreg %desc, i32 %index) { ; GFX678910-LABEL: s_buffer_load_index_across_bb_merged: ; GFX678910: ; %bb.0: ; %main_body ; GFX678910-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -589,16 +589,16 @@ bb1: ; preds = %main_body %tmp1 = or i32 %tmp, 8 - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 %tmp1, i32 0) %tmp2 = or i32 %tmp1, 4 - %load2 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp2, i32 0) + %load2 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 %tmp2, i32 0) %bitcast = bitcast i32 %load to float %bitcast2 = bitcast i32 %load2 to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float %bitcast2, float undef, float undef, i1 true, i1 true) ret void } -define amdgpu_ps i32 @s_buffer_load_imm_neg1(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_neg1(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_neg1: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, -1 @@ -620,11 +620,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -1, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_neg4(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_neg4(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_neg4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, -4 @@ -652,11 +652,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -4, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_neg8(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_neg8(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_neg8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, -8 @@ -684,11 +684,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -8, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_bit31(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_bit31(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_bit31: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s4, 1 @@ -716,11 +716,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -2147483648, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_bit30(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_bit30(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_bit30: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 2.0 @@ -748,11 +748,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1073741824, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_bit29(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_bit29(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_bit29: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s4, 4 @@ -780,11 +780,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 536870912, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_bit21(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_bit21(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_bit21: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0x200000 @@ -812,11 +812,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 2097152, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_bit20(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_bit20(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_bit20: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0x100000 @@ -844,11 +844,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1048576, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_neg_bit20(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_neg_bit20(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_neg_bit20: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0xfff00000 @@ -876,11 +876,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -1048576, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_bit19(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_bit19(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_bit19: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0x80000 @@ -906,11 +906,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 524288, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_neg_bit19(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_neg_bit19(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_neg_bit19: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0xfff80000 @@ -938,11 +938,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 -524288, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_255(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_255(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_255: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_movk_i32 s4, 0xff @@ -969,11 +969,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0xff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 255, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 255, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_256(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_256(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_load_imm_256: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_buffer_load_dword s0, s[0:3], 0x40 @@ -991,11 +991,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 256, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 256, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_1016(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_1016(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_load_imm_1016: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_buffer_load_dword s0, s[0:3], 0xfe @@ -1013,11 +1013,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3f8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1016, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1016, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_1020(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_1020(ptr addrspace(8) inreg %desc) { ; GFX67-LABEL: s_buffer_load_imm_1020: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_buffer_load_dword s0, s[0:3], 0xff @@ -1035,11 +1035,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fc ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1020, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1020, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_1021(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_1021(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_1021: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_movk_i32 s4, 0x3fd @@ -1066,11 +1066,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fd ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1021, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1021, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_1024(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_1024(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_1024: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_movk_i32 s4, 0x400 @@ -1096,11 +1096,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1024, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_1025(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_1025(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_1025: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_movk_i32 s4, 0x401 @@ -1127,11 +1127,11 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x401 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1025, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1025, i32 0) ret i32 %load } -define amdgpu_ps i32 @s_buffer_load_imm_1028(<4 x i32> inreg %desc) { +define amdgpu_ps i32 @s_buffer_load_imm_1028(ptr addrspace(8) inreg %desc) { ; GFX6-LABEL: s_buffer_load_imm_1028: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_movk_i32 s4, 0x400 @@ -1157,15 +1157,15 @@ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0) + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 1024, i32 0) ret i32 %load } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) -declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) -declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8), i32, i32) +declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(ptr addrspace(8), i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX678: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -43,7 +43,7 @@ ret void } -define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { +define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, ptr addrspace(8) inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -76,7 +76,7 @@ ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 br i1 %cmp, label %.zero, label %.one @@ -96,6 +96,6 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32) attributes #0 = { convergent readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -15,8 +15,8 @@ ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ret float %out.0 @@ -36,8 +36,8 @@ ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = bitcast float %out to i32 %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0) @@ -62,10 +62,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src0, %src1 - call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) %out = fadd float %temp, %temp %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ret float %out.0 @@ -94,11 +94,11 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src0, %src1 %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp) - call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) %out = fadd float %temp, %temp %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ret float %out.0 @@ -127,9 +127,9 @@ ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src0, %src1 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) %out = fadd float %temp.0, %temp.0 @@ -159,9 +159,9 @@ ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src0, %src1 %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) %out = fadd float %temp.0, %temp.0 @@ -202,14 +202,14 @@ br i1 %cmp, label %IF, label %ELSE IF: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) br label %END ELSE: - call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -258,8 +258,8 @@ br i1 %cmp, label %IF, label %ELSE IF: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) br label %END @@ -271,7 +271,7 @@ %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 %data.sample = extractelement <4 x float> %dtex, i32 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -279,9 +279,9 @@ ret float %r } -declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2 -declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2 -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3 +declare void @llvm.amdgcn.struct.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2 +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #3 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare float @llvm.amdgcn.wqm.f32(float) #3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -11,12 +11,12 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen ; CHECK-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; Natural mapping, no voffset -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -25,11 +25,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen ; CHECK-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -38,11 +38,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen slc ; CHECK-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } -define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s11, s5 @@ -51,11 +51,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen ; CHECK-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll @@ -18,18 +18,18 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen{{$}} ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc -define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { +define amdgpu_ps float @test1(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { main_body: - %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0) - %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) + %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o2, ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i32 0, i32 0) + %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o3, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) %ofs.5 = add i32 %voffset, 42 - %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i32 0, i32 0) - %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0) - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o4, ptr addrspace(8) %rsrc, i32 0, i32 %ofs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o5, ptr addrspace(8) %rsrc, i32 0, i32 4, i32 8188, i32 0) + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o6, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %o7 = bitcast i32 %o6 to float - %out = call float @llvm.amdgcn.struct.buffer.atomic.swap.f32(float %o7, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.buffer.atomic.swap.f32(float %o7, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %out } @@ -56,19 +56,19 @@ ;CHECK: buffer_atomic_inc v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_dec v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +define amdgpu_ps float @test2(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %vindex) { main_body: - %t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %t2 = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) - %t3 = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %t4 = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) - %t5 = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %t6 = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) - %t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) - %t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %t10 = call i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32 %t9, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %t11 = call i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32 %t10, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t2 = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 %t1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t3 = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 %t2, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t4 = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 %t3, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t5 = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 %t4, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t6 = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 %t5, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 %t6, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 %t7, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 %t8, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t10 = call i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32 %t9, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t11 = call i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32 %t10, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) %out = bitcast i32 %t11 to float ret float %out } @@ -91,22 +91,22 @@ ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen offset:44 glc ;CHECK-DAG: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc -define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { +define amdgpu_ps float @test3(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: - %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0) - %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) + %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o1, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o2, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 %voffset, i32 0, i32 0) + %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o3, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) %offs.5 = add i32 %voffset, 44 - %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %offs.5, i32 0, i32 0) - %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0) + %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o4, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 %offs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o5, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 4, i32 8188, i32 0) ; Detecting the no-return variant doesn't work right now because of how the ; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. ; Since there probably isn't a reasonable use-case of cmpswap that discards ; the return value, that seems okay. ; -; %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) +; %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o6, i32 %cmp, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %out = bitcast i32 %o6 to float ret float %out } @@ -115,24 +115,24 @@ ;CHECK: buffer_atomic_add v0, define amdgpu_ps float @test4() { main_body: - %v = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i32 0, i32 0) + %v = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, ptr addrspace(8) undef, i32 0, i32 4, i32 0, i32 0) %v.float = bitcast i32 %v to float ret float %v.float } -declare i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare float @llvm.amdgcn.struct.buffer.atomic.swap.f32(float, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.swap.f32(float, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32, i32) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}buffer_load_format_d16_x: ; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call half @llvm.amdgcn.struct.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret half %data } @@ -16,9 +16,9 @@ ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] -define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } @@ -29,9 +29,9 @@ ; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } @@ -42,24 +42,24 @@ ; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] -define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } ; GCN-LABEL: {{^}}buffer_load_format_i16_x: ; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_ps half @buffer_load_format_i16_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @buffer_load_format_i16_x(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i16 %data to half ret half %fdata } -declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32) -declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32) -declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32) -declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32) -declare i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32>, i32, i32, i32, i32) +declare half @llvm.amdgcn.struct.buffer.load.format.f16(ptr addrspace(8), i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(ptr addrspace(8), i32, i32, i32, i32) +declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(ptr addrspace(8), i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(ptr addrspace(8), i32, i32, i32, i32) +declare i16 @llvm.amdgcn.struct.buffer.load.format.i16(ptr addrspace(8), i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -3,7 +3,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v8, 0 @@ -32,16 +32,16 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) - %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) - %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load_immoffs: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v0, 0 @@ -63,11 +63,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load_immoffs_large: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v8, 0 @@ -127,15 +127,15 @@ ; GFX11-NEXT: v_add_f32_e32 v2, v10, v2 ; GFX11-NEXT: ; return to shader part epilog main_body: - %d.0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 60, i32 0) - %d.1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 32764, i32 0) - %d.2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4, i32 36860, i32 0) + %d.0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 60, i32 0) + %d.1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 32764, i32 0) + %d.2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4, i32 36860, i32 0) %d.3 = fadd <4 x float> %d.0, %d.1 %data = fadd <4 x float> %d.2, %d.3 ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load_voffset_large_12bit: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v0, 0 @@ -157,11 +157,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load_voffset_large_13bit: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: s_mov_b32 s4, 0 @@ -189,11 +189,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 8188, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8188, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load_voffset_large_16bit: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: s_mov_b32 s4, 0 @@ -221,11 +221,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 65532, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 65532, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load_voffset_large_23bit: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: s_mov_b32 s4, 0 @@ -253,11 +253,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 8388604, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8388604, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load_voffset_large_24bit: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: s_mov_b32 s4, 0 @@ -285,11 +285,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 16777212, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 16777212, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { ; GFX6-LABEL: buffer_load_idx: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen @@ -308,11 +308,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { ; GFX6-LABEL: buffer_load_ofs: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: s_mov_b32 s4, 0 @@ -340,11 +340,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { ; GFX6-LABEL: buffer_load_ofs_imm: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: s_mov_b32 s4, 0 @@ -373,11 +373,11 @@ ; GFX11-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) { ; GFX6-LABEL: buffer_load_both: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen @@ -396,11 +396,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) { ; GFX6-LABEL: buffer_load_both_reversed: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v2, v0 @@ -422,11 +422,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data } -define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: buffer_load_x: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v0, 0 @@ -448,11 +448,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %data } -define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_x_i32(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: buffer_load_x_i32: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v0, 0 @@ -474,12 +474,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i32 %data to float ret float %fdata } -define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { ; GFX6-LABEL: buffer_load_xy: ; GFX6: ; %bb.0: ; %main_body ; GFX6-NEXT: v_mov_b32_e32 v0, 0 @@ -501,11 +501,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret <2 x float> %data } -define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_v4i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -539,7 +539,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> %data, ptr addrspace(1) %out %status = extractvalue { <4 x i32>, i32 } %load, 1 @@ -547,7 +547,7 @@ ret float %fstatus } -define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_v4f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -581,7 +581,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %load, 0 store <4 x float> %data, ptr addrspace(1) %out %status = extractvalue { <4 x float>, i32 } %load, 1 @@ -589,7 +589,7 @@ ret float %fstatus } -define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_v3i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -624,7 +624,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x i32>, i32 } %load, 0 store <3 x i32> %data, ptr addrspace(1) %out %status = extractvalue { <3 x i32>, i32 } %load, 1 @@ -632,7 +632,7 @@ ret float %fstatus } -define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_v3f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -667,7 +667,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x float>, i32 } %load, 0 store <3 x float> %data, ptr addrspace(1) %out %status = extractvalue { <3 x float>, i32 } %load, 1 @@ -675,7 +675,7 @@ ret float %fstatus } -define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_v2i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -709,7 +709,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x i32>, i32 } %load, 0 store <2 x i32> %data, ptr addrspace(1) %out %status = extractvalue { <2 x i32>, i32 } %load, 1 @@ -717,7 +717,7 @@ ret float %fstatus } -define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_v2f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -751,7 +751,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x float>, i32 } %load, 0 store <2 x float> %data, ptr addrspace(1) %out %status = extractvalue { <2 x float>, i32 } %load, 1 @@ -759,7 +759,7 @@ ret float %fstatus } -define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -793,7 +793,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { i32, i32 } %load, 0 store i32 %data, ptr addrspace(1) %out %status = extractvalue { i32, i32 } %load, 1 @@ -801,7 +801,7 @@ ret float %fstatus } -define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) { +define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr addrspace(1) %out) { ; GFX6-LABEL: buffer_load_f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -835,7 +835,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog - %load = call { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { float, i32 } %load, 0 store float %data, ptr addrspace(1) %out %status = extractvalue { float, i32 } %load, 1 @@ -843,16 +843,16 @@ ret float %fstatus } -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #0 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32) #0 -declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 -declare { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 +declare { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(ptr addrspace(8), i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -5,7 +5,7 @@ ; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s @esgs_ring = external addrspace(3) global [0 x i32], align 65536 -define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { +define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX10-LABEL: main: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_mov_b32 s1, exec_lo @@ -113,7 +113,7 @@ ; GFX11-NEXT: ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8 bb: %i = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 undef) - %i2 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %arg, i32 %arg1, i32 0, i32 0, i32 0) + %i2 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(ptr addrspace(8) %arg, i32 %arg1, i32 0, i32 0, i32 0) %i3 = bitcast <3 x half> %i2 to <3 x i16> %i4 = extractelement <3 x i16> %i3, i32 1 %i5 = bitcast <3 x half> %i2 to <3 x i16> @@ -131,6 +131,6 @@ ; Function Attrs: nounwind readnone willreturn declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 ; Function Attrs: nounwind readonly willreturn -declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32 immarg) #1 +declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #1 attributes #0 = { nounwind readnone willreturn } attributes #1 = { nounwind readonly willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL -declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) +declare void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) -define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { +define amdgpu_ps float @buffer_load_lds_dword(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { ; SDAG-LABEL: buffer_load_lds_dword: ; SDAG: ; %bb.0: ; %main_body ; SDAG-NEXT: v_mov_b32_e32 v0, 8 @@ -32,14 +32,14 @@ ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: ; return to shader part epilog main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1) - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2) %res = load float, ptr addrspace(3) %lds ret float %res } -define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { +define amdgpu_ps void @buffer_load_lds_dword_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { ; GCN-LABEL: buffer_load_lds_dword_imm_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -47,11 +47,11 @@ ; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:2048 lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset) { +define amdgpu_ps void @buffer_load_lds_dword_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset) { ; GCN-LABEL: buffer_load_lds_dword_v_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -59,11 +59,11 @@ ; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], 0 idxen offen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 inreg %soffset) { +define amdgpu_ps void @buffer_load_lds_dword_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 inreg %soffset) { ; GCN-LABEL: buffer_load_lds_dword_s_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -71,11 +71,11 @@ ; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 idxen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_load_lds_dword_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GCN-LABEL: buffer_load_lds_dword_vs_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -83,11 +83,11 @@ ; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -95,11 +95,11 @@ ; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen offset:2048 lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { +define amdgpu_ps void @buffer_load_lds_ushort(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { ; GCN-LABEL: buffer_load_lds_ushort: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_mov_b32_e32 v1, 0x800 @@ -108,11 +108,11 @@ ; GCN-NEXT: buffer_load_ushort v[0:1], s[0:3], 0 idxen offen lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0) ret void } -define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { +define amdgpu_ps void @buffer_load_lds_ubyte(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { ; GCN-LABEL: buffer_load_lds_ubyte: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b32 m0, s4 @@ -120,6 +120,6 @@ ; GCN-NEXT: buffer_load_ubyte v0, s[0:3], 0 idxen offset:2048 lds ; GCN-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0) + call void @llvm.amdgcn.struct.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll @@ -6,11 +6,11 @@ ;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) - %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) - %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 @@ -20,9 +20,9 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 40, i32 0, i32 0) ret <4 x float> %data } @@ -30,46 +30,46 @@ ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 4, i32 8188, i32 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { main_body: %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data } @@ -77,37 +77,37 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) { main_body: - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %data = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret float %data } ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret <2 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_negative_offset: ;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0 ;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen -define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { main_body: %ofs.1 = add i32 %ofs, -16 - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs.1, i32 0, i32 0) ret <4 x float> %data } @@ -116,10 +116,10 @@ ; CHECK-LABEL: buffer_load_mmo: ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) { +define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) { entry: store float 0.0, ptr addrspace(3) %lds - %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %val = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4 store float 0.0, ptr addrspace(3) %tmp2 ret float %val @@ -130,11 +130,11 @@ ;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) { main_body: - %data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) - %data_glc = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) - %data_slc = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2) %fdata = bitcast <4 x i32> %data to <4 x float> %fdata_glc = bitcast <2 x i32> %data_glc to <2 x float> %fdata_slc = bitcast i32 %data_slc to float @@ -150,9 +150,9 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i8 %tmp to i32 %val = uitofp i32 %tmp2 to float ret float %val @@ -164,9 +164,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_ushort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i16 %tmp to i32 %val = uitofp i32 %tmp2 to float ret float %val @@ -178,9 +178,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i8 %tmp to i32 %val = sitofp i32 %tmp2 to float ret float %val @@ -192,9 +192,9 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_sshort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { main_body: - %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i16 %tmp to i32 %val = sitofp i32 %tmp2 to float ret float %val @@ -205,9 +205,9 @@ ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b16 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { main_body: - %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) + %val = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store half %val, ptr addrspace(3) %ptr ret void } @@ -217,9 +217,9 @@ ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b32 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { main_body: - %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) + %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x half> %val, ptr addrspace(3) %ptr ret void } @@ -229,9 +229,9 @@ ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b64 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { main_body: - %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) + %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x half> %val, ptr addrspace(3) %ptr ret void } @@ -241,9 +241,9 @@ ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b16 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { main_body: - %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) + %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store i16 %val, ptr addrspace(3) %ptr ret void } @@ -253,9 +253,9 @@ ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b32 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { main_body: - %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) + %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x i16> %val, ptr addrspace(3) %ptr ret void } @@ -265,28 +265,28 @@ ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b64 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { main_body: - %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) + %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x i16> %val, ptr addrspace(3) %ptr ret void } -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32) #0 -declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) #0 -declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0 +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0 +declare i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8), i32, i32, i32, i32) #0 -declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #0 -declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #0 -declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #0 +declare half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32) #0 -declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0 -declare <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32>, i32, i32, i32, i32) #0 -declare <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32>, i32, i32, i32, i32) #0 +declare i16 @llvm.amdgcn.struct.buffer.load.i16(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32, i32) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -6,9 +6,9 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f16(half %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -22,9 +22,9 @@ ; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -45,10 +45,10 @@ ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] ; PACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> - call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -69,9 +69,9 @@ ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -79,14 +79,14 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_i16_x(<4 x i32> %rsrc, [8 x i32], i16 %data, [8 x i32], i32 %index) { +define amdgpu_kernel void @buffer_store_format_i16_x(ptr addrspace(8) %rsrc, [8 x i32], i16 %data, [8 x i32], i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32) -declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) -declare void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) -declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) -declare void @llvm.amdgcn.struct.buffer.store.format.i16(i16, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, ptr addrspace(8), i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.i16(i16, ptr addrspace(8), i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll @@ -6,47 +6,47 @@ ;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } @@ -54,9 +54,9 @@ ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void } @@ -69,46 +69,46 @@ ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %data, ptr addrspace(8) %0, i32 %4, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x1_i32: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1_i32(<4 x i32> inreg %rsrc, i32 %data, i32 %index) { +define amdgpu_ps void @buffer_store_x1_i32(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.format.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -6,47 +6,47 @@ ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } @@ -54,9 +54,9 @@ ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void } @@ -69,29 +69,29 @@ ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %0, i32 %4, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { main_body: - call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) #0 { main_body: - call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -100,11 +100,11 @@ ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc -define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { main_body: - call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) - call void @llvm.amdgcn.struct.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.i32(i32 %3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2) ret void } @@ -113,11 +113,11 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 - call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -126,25 +126,25 @@ ;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_f16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { %v2 = fptrunc float %v1 to half - call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}struct_buffer_store_v2f16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) { - call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) +define amdgpu_ps void @struct_buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %v1, i32 %index) { + call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}struct_buffer_store_v4f16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) { - call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) +define amdgpu_ps void @struct_buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %v1, i32 %index) { + call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -153,44 +153,44 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_i16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 - call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}struct_buffer_store_vif16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) { - call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) +define amdgpu_ps void @struct_buffer_store_vif16(ptr addrspace(8) inreg %rsrc, <2 x i16> %v1, i32 %index) { + call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } ;CHECK-LABEL: {{^}}struct_buffer_store_v4i16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) { - call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) - ret void -} - -declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 -declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) #0 +define amdgpu_ps void @struct_buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %v1, i32 %index) { + call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.struct.buffer.store.i8(i8, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.i16(i16, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.f16(half, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s -define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_x(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -34,11 +34,11 @@ ; GFX11-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) ret half %data } -define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xy(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -71,12 +71,12 @@ ; GFX11-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %data = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xyz(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -109,12 +109,12 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %data = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xyzw(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -147,12 +147,12 @@ ; GFX11-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-PACKED-NEXT: ; return to shader part epilog main_body: - %data = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32) -declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32) -declare <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32) -declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32) +declare half @llvm.amdgcn.struct.tbuffer.load.f16(ptr addrspace(8), i32, i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32, i32) +declare <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(ptr addrspace(8) inreg %0) { ; PREGFX10-LABEL: tbuffer_load: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v12, 0 @@ -37,10 +37,10 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 63, i32 1) - %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 2) - %vdata_f32 = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 5) + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 78, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 63, i32 1) + %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 22, i32 2) + %vdata_f32 = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 22, i32 5) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> @@ -51,7 +51,7 @@ ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 } -define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(ptr addrspace(8) inreg %0) { ; PREGFX10-LABEL: tbuffer_load_immoffs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -73,12 +73,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(ptr addrspace(8) inreg %0, i32 inreg %soffs) { ; PREGFX10-LABEL: tbuffer_load_immoffs_large: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -107,9 +107,9 @@ ; GFX11-NEXT: tbuffer_load_format_xyzw v[8:11], v8, s[0:3], s4 format:77 idxen offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0) - %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 1, i32 %soffs, i32 77, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 4095, i32 61, i32 47, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 1, i32 %soffs, i32 77, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> @@ -119,7 +119,7 @@ ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { +define amdgpu_vs <4 x float> @tbuffer_load_idx(ptr addrspace(8) inreg %0, i32 %vindex) { ; PREGFX10-LABEL: tbuffer_load_idx: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen @@ -138,12 +138,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_ofs(ptr addrspace(8) inreg %0, i32 %voffs) { ; PREGFX10-LABEL: tbuffer_load_ofs: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: s_mov_b32 s4, 0 @@ -171,12 +171,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(ptr addrspace(8) inreg %0, i32 %voffs) { ; PREGFX10-LABEL: tbuffer_load_ofs_imm: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: s_mov_b32 s4, 0 @@ -205,12 +205,12 @@ ; GFX11-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %voffs, 52 - %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_both(ptr addrspace(8) inreg %0, i32 %vindex, i32 %voffs) { ; PREGFX10-LABEL: tbuffer_load_both: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen @@ -229,12 +229,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0) + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } -define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +define amdgpu_vs <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: buffer_load_xy: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -255,12 +255,12 @@ ; GFX11-NEXT: tbuffer_load_format_xy v[0:1], v0, s[0:3], 0 format:77 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) + %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> ret <2 x float> %vdata.f } -define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { +define amdgpu_vs float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { ; PREGFX10-LABEL: buffer_load_x: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -281,12 +281,12 @@ ; GFX11-NEXT: tbuffer_load_format_x v0, v0, s[0:3], 0 format:77 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog - %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) + %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 %vdata to float ret float %vdata.f } -define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) inreg %0) { ; PREGFX10-LABEL: buffer_load_voffset_large_12bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -308,11 +308,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(ptr addrspace(8) inreg %0) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_13bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: s_mov_b32 s4, 0 @@ -340,11 +340,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 8188, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 8188, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(ptr addrspace(8) inreg %0) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_16bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: s_mov_b32 s4, 0 @@ -372,11 +372,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 65532, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 65532, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(ptr addrspace(8) inreg %0) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_23bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: s_mov_b32 s4, 0 @@ -404,11 +404,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 8388604, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 8388604, i32 0, i32 63, i32 0) ret <4 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(ptr addrspace(8) inreg %0) { ; PREGFX10-LABEL: tbuffer_load_voffset_large_24bit: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: s_mov_b32 s4, 0 @@ -436,11 +436,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 16777212, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 16777212, i32 0, i32 63, i32 0) ret <4 x float> %data } -declare i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32) -declare <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32) -declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32) -declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32) +declare i32 @llvm.amdgcn.struct.tbuffer.load.i32(ptr addrspace(8), i32, i32, i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(ptr addrspace(8), i32, i32, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-PACKED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-PACKED %s -define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 @@ -49,11 +49,11 @@ ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 @@ -100,11 +100,11 @@ ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 @@ -164,11 +164,11 @@ ; GFX11-PACKED-NEXT: s_endpgm main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 @@ -226,11 +226,11 @@ ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void } -declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32) -declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32) -declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32) -declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, ptr addrspace(8), i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s -define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { ; VERDE-LABEL: tbuffer_store: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: v_mov_b32_e32 v12, 0 @@ -46,14 +46,14 @@ %in1 = bitcast <4 x float> %1 to <4 x i32> %in2 = bitcast <4 x float> %2 to <4 x i32> %in3 = bitcast <4 x float> %3 to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 44, i32 0) - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 61, i32 1) - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 2) - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 5) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 44, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 61, i32 1) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 78, i32 2) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 78, i32 5) ret void } -define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @tbuffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { ; VERDE-LABEL: tbuffer_store_immoffs: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: v_mov_b32_e32 v4, 0 @@ -80,11 +80,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 117, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 117, i32 0) ret void } -define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(ptr addrspace(8) inreg, <4 x float> %vdata, i32 inreg %soffset) { ; VERDE-LABEL: tbuffer_store_scalar_and_imm_offs: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: v_mov_b32_e32 v4, 0 @@ -111,11 +111,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 %soffset, i32 117, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 42, i32 %soffset, i32 117, i32 0) ret void } -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { +define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %vindex) { ; VERDE-LABEL: buffer_store_idx: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen @@ -138,11 +138,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 47, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 %vindex, i32 0, i32 0, i32 47, i32 0) ret void } -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %voffset) { ; VERDE-LABEL: buffer_store_ofs: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: s_mov_b32 s4, 0 @@ -177,11 +177,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 115, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 %voffset, i32 0, i32 115, i32 0) ret void } -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { +define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { ; VERDE-LABEL: buffer_store_both: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen @@ -204,11 +204,11 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 70, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 %vindex, i32 %voffset, i32 0, i32 70, i32 0) ret void } -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { ; VERDE-LABEL: buffer_store_wait: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen @@ -244,14 +244,14 @@ ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %vindex.2, i32 0, i1 0, i1 0) %data.i = bitcast <4 x float> %data to <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, ptr addrspace(8) %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0) ret void } -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; VERDE-LABEL: buffer_store_x1: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen @@ -274,11 +274,11 @@ ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast float %data to i32 - call void @llvm.amdgcn.struct.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 125, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.i32(i32 %data.i, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 125, i32 0) ret void } -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %vindex) { ; VERDE-LABEL: buffer_store_x2: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen @@ -301,11 +301,11 @@ ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast <2 x float> %data to <2 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32> %data.i, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_12bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_12bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; VERDE-LABEL: buffer_store_voffset_large_12bit: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: v_mov_b32_e32 v4, 0 @@ -331,11 +331,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 4092, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 4092, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_13bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_13bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; VERDE-LABEL: buffer_store_voffset_large_13bit: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: s_mov_b32 s4, 0 @@ -369,11 +369,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 8188, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 8188, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_16bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_16bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; VERDE-LABEL: buffer_store_voffset_large_16bit: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: s_mov_b32 s4, 0 @@ -407,11 +407,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 65532, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 65532, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_23bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_23bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; VERDE-LABEL: buffer_store_voffset_large_23bit: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: s_mov_b32 s4, 0 @@ -445,11 +445,11 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 8388604, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 8388604, i32 0, i32 63, i32 0) ret void } -define amdgpu_ps void @buffer_store_voffset_large_24bit(<4 x i32> inreg %rsrc, <4 x float> %data) { +define amdgpu_ps void @buffer_store_voffset_large_24bit(ptr addrspace(8) inreg %rsrc, <4 x float> %data) { ; VERDE-LABEL: buffer_store_voffset_large_24bit: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: s_mov_b32 s4, 0 @@ -483,15 +483,15 @@ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 16777212, i32 0, i32 63, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 16777212, i32 0, i32 63, i32 0) ret void } -declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, ptr addrspace(8), i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}tbuffer_load_d16_x: ; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_x(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %data = call half @llvm.amdgcn.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) ret half %data } @@ -16,9 +16,9 @@ ; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] -define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xy(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } @@ -29,9 +29,9 @@ ; PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xyz(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %data = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } @@ -42,14 +42,14 @@ ; PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] -define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +define amdgpu_ps half @tbuffer_load_d16_xyzw(ptr addrspace(8) inreg %rsrc) { main_body: - %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare half @llvm.amdgcn.tbuffer.load.f16(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}tbuffer_raw_load_immoffs_x3: ; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 ; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 -define amdgpu_vs <3 x float> @tbuffer_raw_load_immoffs_x3(<4 x i32> inreg) { +define amdgpu_vs <3 x float> @tbuffer_raw_load_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %vdata = call <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) + %vdata = call <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(ptr addrspace(8) %0, i32 42, i32 0, i32 78, i32 0) %vdata.f = bitcast <3 x i32> %vdata to <3 x float> ret <3 x float> %vdata.f } @@ -16,9 +16,9 @@ ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 ; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42 ; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42 -define amdgpu_vs <3 x float> @tbuffer_struct_load_immoffs_x3(<4 x i32> inreg) { +define amdgpu_vs <3 x float> @tbuffer_struct_load_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %vdata = call <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) + %vdata = call <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 78, i32 0) %vdata.f = bitcast <3 x i32> %vdata to <3 x float> ret <3 x float> %vdata.f } @@ -27,14 +27,14 @@ ; GCN-LABEL: {{^}}tbuffer_load_format_immoffs_x3: ; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 ; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 -define amdgpu_vs <3 x float> @tbuffer_load_format_immoffs_x3(<4 x i32> inreg) { +define amdgpu_vs <3 x float> @tbuffer_load_format_immoffs_x3(ptr addrspace(8) inreg) { main_body: - %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) + %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) %vdata.f = bitcast <3 x i32> %vdata to <3 x float> ret <3 x float> %vdata.f } -declare <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32) -declare <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32) -declare <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(ptr addrspace(8), i32, i32, i32, i32) +declare <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(ptr addrspace(8), i32, i32, i32, i32, i32) +declare <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll @@ -7,12 +7,12 @@ ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; GCN: s_waitcnt -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(ptr addrspace(8) inreg) { main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0) - %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1) - %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1) + %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> @@ -25,9 +25,9 @@ ; GCN-LABEL: {{^}}tbuffer_load_immoffs: ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 -define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(ptr addrspace(8) inreg) { main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } @@ -37,10 +37,10 @@ ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1 ; GCN: s_waitcnt -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0) - %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0) +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(ptr addrspace(8) inreg, i32 inreg %soffs) { + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> @@ -52,36 +52,36 @@ ; GCN-LABEL: {{^}}tbuffer_load_idx: ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen -define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { +define amdgpu_vs <4 x float> @tbuffer_load_idx(ptr addrspace(8) inreg, i32 %vindex) { main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } ; GCN-LABEL: {{^}}tbuffer_load_ofs: ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen -define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_ofs(ptr addrspace(8) inreg, i32 %voffs) { main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52 -define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(ptr addrspace(8) inreg, i32 %voffs) { main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0) + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } ; GCN-LABEL: {{^}}tbuffer_load_both: ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen -define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { +define amdgpu_vs <4 x float> @tbuffer_load_both(ptr addrspace(8) inreg, i32 %vindex, i32 %voffs) { main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8) %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) %vdata.f = bitcast <4 x i32> %vdata to <4 x float> ret <4 x float> %vdata.f } @@ -89,21 +89,21 @@ ; GCN-LABEL: {{^}}buffer_load_xy: ; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] -define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { - %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) +define amdgpu_vs <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> ret <2 x float> %vdata.f } ; GCN-LABEL: {{^}}buffer_load_x: ; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] -define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { - %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) +define amdgpu_vs float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) %vdata.f = bitcast i32 %vdata to float ret float %vdata.f } -declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.tbuffer.load.i32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -7,9 +7,9 @@ ; GCN: s_load_dword s[[S_LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] ; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) { main_body: - call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) ret void } @@ -22,9 +22,9 @@ ; UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) ret void } @@ -43,9 +43,9 @@ ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR0]] ; PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <3 x half> %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) ret void } @@ -64,13 +64,13 @@ ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.f16(half, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll @@ -2,10 +2,10 @@ ; GCN-LABEL: {{^}}tbuffer_raw_store_immoffs_x3: ; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 -define amdgpu_ps void @tbuffer_raw_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @tbuffer_raw_store_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: %in1 = bitcast <3 x float> %1 to <3 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 42, i32 0, i32 117, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32> %in1, ptr addrspace(8) %0, i32 42, i32 0, i32 117, i32 0) ret void } @@ -13,23 +13,23 @@ ; GCN-LABEL: {{^}}tbuffer_struct_store_immoffs_x3: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 ; GCN: tbuffer_store_format_xyz v[0:2], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42 -define amdgpu_ps void @tbuffer_struct_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @tbuffer_struct_store_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: %in1 = bitcast <3 x float> %1 to <3 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 117, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 117, i32 0) ret void } ; GCN-LABEL: {{^}}tbuffer_store_immoffs_x3: ; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 -define amdgpu_ps void @tbuffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +define amdgpu_ps void @tbuffer_store_immoffs_x3(ptr addrspace(8) inreg, <3 x float>) { main_body: %in1 = bitcast <3 x float> %1 to <3 x i32> - call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll @@ -6,60 +6,60 @@ ; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc ; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc ; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] -define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> %in2 = bitcast <4 x float> %2 to <4 x i32> %in3 = bitcast <4 x float> %3 to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0) - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0) - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1) - call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1) + call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) ret void } ; GCN-LABEL: {{^}}tbuffer_store_immoffs: ; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 -define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @tbuffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) ret void } ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: ; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 -define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(ptr addrspace(8) inreg, <4 x float> %vdata, i32 inreg %soffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_store_idx: ; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { +define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %vindex) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_store_ofs: ; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_store_both: ; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { +define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0) ret void } @@ -71,39 +71,39 @@ ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GCN: s_waitcnt vmcnt(0) ; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { +define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, ptr addrspace(8) %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %vindex.2, i32 0, i1 0, i1 0) %data.i = bitcast <4 x float> %data to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, ptr addrspace(8) %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_store_x1: ; GCN: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { main_body: %data.i = bitcast float %data to i32 - call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0) ret void } ; GCN-LABEL: {{^}}buffer_store_x2: ; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { +define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %vindex) { main_body: %data.i = bitcast <2 x float> %data to <2 x i32> - call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -4,7 +4,7 @@ ; Test that checks for redundant copies to temporary stack slot produced by ; expandUnalignedLoad. -define amdgpu_vs void @test(<4 x i32> inreg %arg1, ptr addrspace(3) %arg2) { +define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v0 @@ -23,11 +23,11 @@ call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false) %var1 = load <6 x float>, ptr addrspace(3) %arg2, align 4 %var2 = shufflevector <6 x float> %var1, <6 x float> undef, <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %var2, <4 x i32> %arg1, i32 0, i32 0, i32 0, i32 immarg 126, i32 immarg 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %var2, ptr addrspace(8) %arg1, i32 0, i32 0, i32 0, i32 immarg 126, i32 immarg 0) ret void } -define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, ptr addrspace(3) %arg4) { +define amdgpu_vs void @test_2(ptr addrspace(8) inreg %arg1, i32 %arg2, i32 inreg %arg3, ptr addrspace(3) %arg4) { ; CHECK-LABEL: test_2: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1 @@ -53,13 +53,13 @@ ; CHECK-NEXT: s_endpgm %load = load <8 x float>, ptr addrspace(3) %arg4, align 4 %vec1 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec1, <4 x i32> %arg1, i32 %arg2, i32 0, i32 %arg3, i32 immarg 77, i32 immarg 3) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec1, ptr addrspace(8) %arg1, i32 %arg2, i32 0, i32 %arg3, i32 immarg 77, i32 immarg 3) %vec2 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec2, <4 x i32> %arg1, i32 %arg2, i32 16, i32 %arg3, i32 immarg 77, i32 immarg 3) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec2, ptr addrspace(8) %arg1, i32 %arg2, i32 16, i32 %arg3, i32 immarg 77, i32 immarg 3) ret void } -define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, ptr addrspace(3) %arg5, ptr addrspace(3) %arg6) { +define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8) inreg %arg3, i32 %arg4, ptr addrspace(3) %arg5, ptr addrspace(3) %arg6) { ; CHECK-LABEL: test_3: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s7, s5 @@ -103,21 +103,21 @@ ; CHECK-NEXT: s_endpgm %load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4 %vec11 = shufflevector <6 x float> %load1, <6 x float> undef, <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec11, <4 x i32> %arg3, i32 %arg1, i32 264, i32 %arg2, i32 immarg 77, i32 immarg 3) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec11, ptr addrspace(8) %arg3, i32 %arg1, i32 264, i32 %arg2, i32 immarg 77, i32 immarg 3) %vec12 = shufflevector <6 x float> %load1, <6 x float> undef, <2 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec12, <4 x i32> %arg3, i32 %arg1, i32 280, i32 %arg2, i32 immarg 64, i32 immarg 3) + call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec12, ptr addrspace(8) %arg3, i32 %arg1, i32 280, i32 %arg2, i32 immarg 64, i32 immarg 3) call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false) %load2 = load <6 x float>, ptr addrspace(3) %arg6, align 4 %vec21 = shufflevector <6 x float> %load2, <6 x float> undef, <4 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec21, <4 x i32> %arg3, i32 %arg1, i32 240, i32 %arg2, i32 immarg 77, i32 immarg 3) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec21, ptr addrspace(8) %arg3, i32 %arg1, i32 240, i32 %arg2, i32 immarg 77, i32 immarg 3) %vec22 = shufflevector <6 x float> %load2, <6 x float> undef, <2 x i32> - call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec22, <4 x i32> %arg3, i32 %arg1, i32 256, i32 %arg2, i32 immarg 64, i32 immarg 3) + call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec22, ptr addrspace(8) %arg3, i32 %arg1, i32 256, i32 %arg2, i32 immarg 64, i32 immarg 3) ret void } -declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) -declare void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -22,7 +22,7 @@ br i1 %tmp27, label %then, label %endif then: ; preds = %bb - call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float undef, ptr addrspace(8) undef, i32 0, i32 undef, i32 0) br label %endif endif: ; preds = %bb28, %bb @@ -49,7 +49,7 @@ %tmp23phi = phi i32 [ %tmp23, %loop ], [ 0, %entry ] %tmp23 = add nuw i32 %tmp23phi, 1 %tmp27 = icmp ult i32 %arg, %tmp23 - call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float undef, ptr addrspace(8) undef, i32 0, i32 undef, i32 0) br i1 %tmp27, label %loop, label %loopexit loopexit: @@ -76,7 +76,7 @@ br i1 %tmp27, label %then, label %endif then: ; preds = %bb - call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float undef, ptr addrspace(8) undef, i32 0, i32 undef, i32 0) br label %endif endif: ; preds = %bb28, %bb @@ -86,6 +86,6 @@ ret void } -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -29,11 +29,11 @@ %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 - call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %ielemz, <4 x i32> undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) ret void } declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() -declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll --- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll @@ -26,10 +26,10 @@ %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1 %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2 %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3 - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 78, i32 3) #2 + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %tmp11, ptr addrspace(8) undef, i32 0, i32 0, i32 %arg, i32 78, i32 3) #2 ret void } -declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #0 attributes #0 = { nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll --- a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll @@ -10,14 +10,14 @@ store i32 %v, ptr addrspace(3) null - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 68, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %v, ptr addrspace(8) undef, i32 0, i32 0, i32 68, i32 1) %w = load i32, ptr addrspace(3) null store i32 %w, ptr addrspace(3) %p1 ret void } -declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #1 +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #1 attributes #0 = { nounwind } attributes #1 = { nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -8,7 +8,7 @@ ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions. -define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { +define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX9_W64-LABEL: mubuf_vgpr: ; GFX9_W64: ; %bb.0: ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -147,15 +147,28 @@ ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 ; W64-O0-NEXT: v_mov_b32_e32 v6, v2 +; W64-O0-NEXT: v_mov_b32_e32 v2, v1 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v7, v3 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v2 ; W64-O0-NEXT: v_mov_b32_e32 v7, v1 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v1, v7 @@ -166,7 +179,8 @@ ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 ; W64-O0-NEXT: v_writelane_b32 v5, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec @@ -228,7 +242,7 @@ ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] - %call = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1 + %call = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1 ret float %call } @@ -236,7 +250,7 @@ ; FIXME: redundant s_mov -define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 { +define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 { ; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX9_W64: ; %bb.0: ; %entry ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -480,35 +494,47 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v15, v6 +; W64-O0-NEXT: v_mov_b32_e32 v8, v5 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v14, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v6, v3 -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: v_mov_b32_e32 v8, v1 +; W64-O0-NEXT: v_mov_b32_e32 v4, v1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v2, v0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v16, v5 +; W64-O0-NEXT: v_mov_b32_e32 v5, v16 +; W64-O0-NEXT: v_mov_b32_e32 v6, v15 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v15, v8 +; W64-O0-NEXT: v_mov_b32_e32 v8, v15 +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v15, v5 -; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_mov_b32_e32 v16, v4 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_mov_b32_e32 v17, v3 +; W64-O0-NEXT: v_mov_b32_e32 v15, v8 +; W64-O0-NEXT: v_mov_b32_e32 v16, v6 +; W64-O0-NEXT: v_mov_b32_e32 v17, v5 ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -516,6 +542,20 @@ ; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: v_mov_b32_e32 v6, v8 +; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec @@ -548,8 +588,10 @@ ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 -; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 ; W64-O0-NEXT: v_writelane_b32 v13, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec @@ -586,7 +628,7 @@ ; W64-O0-NEXT: v_writelane_b32 v13, s4, 7 ; W64-O0-NEXT: v_writelane_b32 v13, s5, 8 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: v_readlane_b32 s4, v13, 7 ; W64-O0-NEXT: v_readlane_b32 s5, v13, 8 ; W64-O0-NEXT: v_readlane_b32 s8, v13, 3 @@ -598,7 +640,7 @@ ; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: @@ -639,7 +681,7 @@ ; W64-O0-NEXT: v_writelane_b32 v13, s4, 15 ; W64-O0-NEXT: v_writelane_b32 v13, s5, 16 ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: v_readlane_b32 s4, v13, 15 ; W64-O0-NEXT: v_readlane_b32 s5, v13, 16 ; W64-O0-NEXT: v_readlane_b32 s8, v13, 11 @@ -651,17 +693,17 @@ ; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 @@ -675,13 +717,13 @@ ; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] entry: - %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1 - %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %j, i32 %c, i32 0, i32 0, i32 0) #1 + %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1 + %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %j, i32 %c, i32 0, i32 0, i32 0) #1 store volatile float %val0, ptr addrspace(1) %out0 store volatile float %val1, ptr addrspace(1) %out1 ret void @@ -690,7 +732,7 @@ ; Confirm spills do not occur between the XOR and branch that terminate the ; waterfall loop BBs. -define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GFX9_W64-LABEL: mubuf_vgpr_outside_entry: ; GFX9_W64: ; %bb.0: ; %entry ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -987,44 +1029,47 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v9, v7 -; W64-O0-NEXT: v_mov_b32_e32 v10, v6 -; W64-O0-NEXT: v_mov_b32_e32 v11, v5 +; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v5, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v5, v3 -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: v_mov_b32_e32 v7, v1 -; W64-O0-NEXT: v_mov_b32_e32 v13, v0 +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v10, v2 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v6, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v9, v0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v11, v3 +; W64-O0-NEXT: v_mov_b32_e32 v3, v11 +; W64-O0-NEXT: v_mov_b32_e32 v5, v10 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v11 -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: v_mov_b32_e32 v3, v9 +; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v10, v6 +; W64-O0-NEXT: v_mov_b32_e32 v6, v10 +; W64-O0-NEXT: v_mov_b32_e32 v13, v9 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v7 -; W64-O0-NEXT: v_mov_b32_e32 v15, v6 -; W64-O0-NEXT: v_mov_b32_e32 v16, v5 +; W64-O0-NEXT: v_mov_b32_e32 v14, v6 +; W64-O0-NEXT: v_mov_b32_e32 v15, v5 +; W64-O0-NEXT: v_mov_b32_e32 v16, v3 ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill @@ -1032,18 +1077,30 @@ ; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v5, v12 +; W64-O0-NEXT: v_mov_b32_e32 v5, v7 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v1 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; implicit-def: $sgpr4 +; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v12 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND @@ -1099,12 +1156,12 @@ ; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: v_readlane_b32 s6, v8, 2 ; W64-O0-NEXT: v_readlane_b32 s7, v8, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] @@ -1113,7 +1170,7 @@ ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 ; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v8, s4, 10 ; W64-O0-NEXT: v_writelane_b32 v8, s5, 11 @@ -1121,22 +1178,49 @@ ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 ; W64-O0-NEXT: ; %bb.4: ; %bb1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: v_readlane_b32 s4, v8, 0 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_mov_b32_e32 v6, v4 +; W64-O0-NEXT: v_mov_b32_e32 v0, v3 +; W64-O0-NEXT: v_mov_b32_e32 v4, v2 +; W64-O0-NEXT: v_mov_b32_e32 v5, v1 +; W64-O0-NEXT: ; implicit-def: $sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr5 +; W64-O0-NEXT: ; implicit-def: $sgpr5 +; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v6 +; W64-O0-NEXT: v_mov_b32_e32 v2, v5 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b32 s5, 0 ; W64-O0-NEXT: v_writelane_b32 v8, s5, 12 ; W64-O0-NEXT: v_mov_b32_e32 v0, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v8, s4, 13 ; W64-O0-NEXT: v_writelane_b32 v8, s5, 14 ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 @@ -1161,7 +1245,7 @@ ; W64-O0-NEXT: v_writelane_b32 v8, s4, 19 ; W64-O0-NEXT: v_writelane_b32 v8, s5, 20 ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: v_readlane_b32 s4, v8, 19 ; W64-O0-NEXT: v_readlane_b32 s5, v8, 20 ; W64-O0-NEXT: v_readlane_b32 s8, v8, 15 @@ -1173,22 +1257,22 @@ ; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; W64-O0-NEXT: v_readlane_b32 s4, v8, 13 ; W64-O0-NEXT: v_readlane_b32 s5, v8, 14 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: v_readlane_b32 s4, v8, 10 ; W64-O0-NEXT: v_readlane_b32 s5, v8, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1196,19 +1280,19 @@ ; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] entry: %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" () - %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 + %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %i, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp = icmp eq i32 %idx, 0 br i1 %cmp, label %bb1, label %bb2 bb1: - %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %j, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 + %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %j, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 br label %bb2 bb2: @@ -1218,7 +1302,7 @@ } declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #1 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll @@ -8,8 +8,8 @@ define amdgpu_vs float @test_none(ptr addrspace(4) inreg %base, i32 %i) { main_body: %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i - %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i32 0) + %tmp2 = load ptr addrspace(8), ptr addrspace(4) %ptr, align 32 + %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %tmp2, i32 0, i32 0, i32 0) ret float %tmp7 } @@ -18,8 +18,8 @@ define amdgpu_vs float @test_idxen(ptr addrspace(4) inreg %base, i32 %i) { main_body: %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i - %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i32 0, i32 0) + %tmp2 = load ptr addrspace(8), ptr addrspace(4) %ptr, align 32 + %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %tmp2, i32 undef, i32 0, i32 0, i32 0) ret float %tmp7 } @@ -28,8 +28,8 @@ define amdgpu_vs float @test_offen(ptr addrspace(4) inreg %base, i32 %i) { main_body: %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i - %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i32 0) + %tmp2 = load ptr addrspace(8), ptr addrspace(4) %ptr, align 32 + %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %tmp2, i32 undef, i32 0, i32 0) ret float %tmp7 } @@ -38,13 +38,13 @@ define amdgpu_vs float @test_both(ptr addrspace(4) inreg %base, i32 %i) { main_body: %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i - %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i32 0, i32 0) + %tmp2 = load ptr addrspace(8), ptr addrspace(4) %ptr, align 32 + %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %tmp2, i32 undef, i32 undef, i32 0, i32 0) ret float %tmp7 } -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #1 -declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32 immarg) #1 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #1 +declare float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32 immarg) #1 attributes #0 = { nounwind readnone } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll @@ -55,12 +55,11 @@ ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc define amdgpu_gs void @soffset_max_imm(ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp1 = load <4 x i32>, ptr addrspace(4) %0 + %tmp1 = load ptr addrspace(8), ptr addrspace(4) %0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %tmp1, i32 %tmp2, i32 64, i32 1) %tmp4 = add i32 %6, 16 - %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, ptr addrspace(8) %tmp1, i32 %tmp4, i32 %4, i32 68, i32 3) ret void } @@ -73,12 +72,11 @@ ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc define amdgpu_gs void @soffset_no_fold(ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp1 = load <4 x i32>, ptr addrspace(4) %0 + %tmp1 = load ptr addrspace(8), ptr addrspace(4) %0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %tmp1, i32 %tmp2, i32 65, i32 1) %tmp4 = add i32 %6, 16 - %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, ptr addrspace(8) %tmp1, i32 %tmp4, i32 %4, i32 68, i32 3) ret void } @@ -174,8 +172,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2 -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #3 +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #2 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) #3 attributes #0 = { nounwind readonly } attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll --- a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -15,39 +15,39 @@ define amdgpu_gs void @main(i32 inreg %arg) #0 { main_body: - %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 20, i32 0) - %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 24, i32 0) - %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 48, i32 0) + %tmp = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 20, i32 0) + %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 24, i32 0) + %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 48, i32 0) %array_vector3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 3 %array_vector5 = insertelement <4 x float> , float %tmp, i32 1 %array_vector6 = insertelement <4 x float> %array_vector5, float undef, i32 2 %array_vector9 = insertelement <4 x float> , float %tmp1, i32 1 %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 - %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0) - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 36, i32 %arg, i32 68, i32 3) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 4864, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, ptr addrspace(8) undef, i32 36, i32 %arg, i32 68, i32 3) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 48, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp4, ptr addrspace(8) undef, i32 48, i32 %arg, i32 68, i32 3) %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> %tmp5 = extractelement <4 x i32> %bc49, i32 undef - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 72, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp5, ptr addrspace(8) undef, i32 72, i32 %arg, i32 68, i32 3) %array_vector21 = insertelement <4 x float> , float %tmp, i32 1 %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 28, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, ptr addrspace(8) undef, i32 28, i32 %arg, i32 68, i32 3) %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> %tmp6 = extractelement <4 x i32> %bc52, i32 undef - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 64, i32 %arg, i32 68, i32 3) - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 20, i32 %arg, i32 68, i32 3) - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 56, i32 %arg, i32 68, i32 3) - call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 92, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp6, ptr addrspace(8) undef, i32 64, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, ptr addrspace(8) undef, i32 20, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, ptr addrspace(8) undef, i32 56, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, ptr addrspace(8) undef, i32 92, i32 %arg, i32 68, i32 3) ret void } -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #1 -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #2 -declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #3 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32 immarg) #1 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #3 attributes #0 = { nounwind "target-cpu"="tonga" } attributes #1 = { nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN -define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output, <3 x i32> %LocalInvocationId) { +define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) { ; GCN-LABEL: if_then: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -50,17 +50,17 @@ %i562 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src, i32 0) %i563 = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %i562, i32 273, i32 15, i32 15, i1 false) %i564 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %i563) - call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %i564, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, ptr addrspace(8) %output, i32 %i564, i32 0, i32 0) br label %.end .end: %idx = phi i32 [ 0, %.then ], [ %src, %.merge ] - call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %idx, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, ptr addrspace(8) %output, i32 %idx, i32 0, i32 0) ret void } -define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg %output, <3 x i32> %LocalInvocationId) { +define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) { ; GCN-LABEL: if_else_vgpr_opt: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -114,14 +114,14 @@ br i1 %i530, label %.then, label %.else .then: - call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %src, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, ptr addrspace(8) %output, i32 %src, i32 0, i32 0) br label %.end .else: %i562 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src, i32 0) %i563 = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %i562, i32 273, i32 15, i32 15, i1 false) %i564 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %i563) - call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %i564, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, ptr addrspace(8) %output, i32 %i564, i32 0, i32 0) br label %.end .end: @@ -131,7 +131,7 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32 immarg, i32 immarg) #2 +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32 immarg, i32 immarg) #2 attributes #0 = { convergent nounwind readnone willreturn } attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll --- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll @@ -14,8 +14,8 @@ ; CHECK: buffer_load_dwordx2 ; CHECK-NEXT: s_setprio 0 ; CHECK: ; return to shader part epilog -define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { - %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) +define amdgpu_ps <2 x float> @vmem_in_exit_block(ptr addrspace(8) inreg %p, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 0, i32 0) %s = fadd <2 x float> %v, %x ret <2 x float> %s } @@ -31,7 +31,7 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[EXIT]] ; CHECK-NEXT: [[EXIT]]: -define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { +define amdgpu_ps <2 x float> @branch(ptr addrspace(8) inreg %p, i32 inreg %i, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { %cond = icmp eq i32 %i, 0 br i1 %cond, label %a, label %b @@ -39,7 +39,7 @@ ret <2 x float> b: - %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 0, i32 0) %s = fadd <2 x float> %v, %x ret <2 x float> %s } @@ -59,14 +59,14 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[EXIT]] ; CHECK: [[EXIT]]: -define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="3" { +define amdgpu_ps <2 x float> @setprio_follows_setprio(ptr addrspace(8) inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="3" { entry: - %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 0, i32 0) %cond1 = icmp ne i32 %i, 0 br i1 %cond1, label %a, label %c a: - %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0) + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 1, i32 0) %v20 = extractelement <2 x float> %v2, i32 0 %v21 = extractelement <2 x float> %v2, i32 1 %cond2 = fcmp ult float %v20, %v21 @@ -91,7 +91,7 @@ ; CHECK: s_cbranch_scc1 [[LOOP]] ; CHECK-NEXT: {{.*}}: ; %exit ; CHECK-NEXT: s_setprio 0 -define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) "amdgpu-wave-priority-threshold"="2" { +define amdgpu_ps <2 x float> @loop(ptr addrspace(8) inreg %p) "amdgpu-wave-priority-threshold"="2" { entry: br label %loop @@ -101,7 +101,7 @@ %i2 = add i32 %i, 1 - %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0) + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 %i, i32 0, i32 0, i32 0) %sum2 = fadd <2 x float> %sum, %v %cond = icmp ult i32 %i2, 5 @@ -129,9 +129,9 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[RET]] ; CHECK: [[RET]]: -define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) "amdgpu-wave-priority-threshold"="2" { +define amdgpu_ps <2 x float> @edge_split(ptr addrspace(8) inreg %p, i32 inreg %x) "amdgpu-wave-priority-threshold"="2" { entry: - %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 0, i32 0) %cond = icmp ne i32 %x, 0 br i1 %cond, label %loop, label %another_load @@ -149,7 +149,7 @@ ret <2 x float> %mul2 another_load: - %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0) + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 1, i32 0) %sum = fadd <2 x float> %v, %v2 ret <2 x float> %sum } @@ -166,8 +166,8 @@ ; CHECK: [[A]]: ; %a ; CHECK: s_branch [[END]] ; CHECK: [[END]]: -define amdgpu_ps <2 x float> @valu_insts_threshold(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="4" { - %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) +define amdgpu_ps <2 x float> @valu_insts_threshold(ptr addrspace(8) inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="4" { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 0, i32 0) %add = fadd <2 x float> %v, %v %add2 = fadd <2 x float> %add, %add @@ -178,7 +178,7 @@ ret <2 x float> %add2 b: - %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0) + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 1, i32 0, i32 0) %sub = fsub <2 x float> %add2, %v2 ret <2 x float> %sub } @@ -186,8 +186,8 @@ ; CHECK-LABEL: valu_insts_threshold2: ; CHECK-NOT: s_setprio ; CHECK: ; -- End function -define amdgpu_ps <2 x float> @valu_insts_threshold2(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="5" { - %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) +define amdgpu_ps <2 x float> @valu_insts_threshold2(ptr addrspace(8) inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="5" { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 0, i32 0) %add = fadd <2 x float> %v, %v %add2 = fadd <2 x float> %add, %add @@ -198,9 +198,9 @@ ret <2 x float> %add2 b: - %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0) + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 1, i32 0, i32 0) %sub = fsub <2 x float> %add2, %v2 ret <2 x float> %sub } -declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -7,10 +7,10 @@ ; CHECK: s_xor_b32 s{{[0-9]}}, [[DST]] define amdgpu_ps void @phi1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0 - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0) - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0) - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg, !tbaa !0 + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 0, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 16, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 32, i32 0) %tmp24 = fptosi float %tmp22 to i32 %tmp25 = icmp ne i32 %tmp24, 0 br i1 %tmp25, label %ENDIF, label %ELSE @@ -30,22 +30,22 @@ ; CHECK-LABEL: {{^}}phi2: define amdgpu_ps void @phi2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0 - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0) - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0) - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 36, i32 0) - %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 40, i32 0) - %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 48, i32 0) - %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 52, i32 0) - %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 56, i32 0) - %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 64, i32 0) - %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 68, i32 0) - %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 72, i32 0) - %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 76, i32 0) - %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 80, i32 0) - %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 84, i32 0) - %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 88, i32 0) - %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 92, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg, !tbaa !0 + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 16, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 32, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 36, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 40, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 48, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 52, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 56, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 64, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 68, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 72, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 76, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 80, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 84, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 88, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 92, i32 0) %tmp37 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0 %tmp39 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0 %i.i = extractelement <2 x i32> %arg5, i32 0 @@ -167,11 +167,11 @@ ; CHECK-LABEL: {{^}}loop: define amdgpu_ps void @loop(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0 - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0) - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 4, i32 0) - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 8, i32 0) - %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 12, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg, !tbaa !0 + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 0, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 4, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 8, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 12, i32 0) %tmp25 = fptosi float %tmp24 to i32 %tmp26 = bitcast i32 %tmp25 to float %tmp27 = bitcast float %tmp26 to i32 @@ -219,8 +219,8 @@ ; CHECK: s_endpgm define amdgpu_ps void @sample_v3(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: - %tmp21 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0 - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 16, i32 0) + %tmp21 = load ptr addrspace(8), ptr addrspace(4) %arg, !tbaa !0 + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 16, i32 0) %tmp24 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0 %tmp26 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0 %tmp27 = fcmp oeq float %tmp22, 0.000000e+00 @@ -318,8 +318,8 @@ ; CHECK: s_branch define amdgpu_ps void @sample_rsrc(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: - %tmp22 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !3 - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp22, i32 16, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %arg1, !tbaa !3 + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp22, i32 16, i32 0) %tmp26 = load <8 x i32>, ptr addrspace(4) %arg3, !tbaa !3 %tmp28 = load <4 x i32>, ptr addrspace(4) %arg2, !tbaa !3 %i.i = extractelement <2 x i32> %arg7, i32 0 @@ -401,7 +401,7 @@ declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll @@ -27,8 +27,11 @@ ; SI-NEXT: s_cbranch_execz .LBB0_5 ; SI-NEXT: ; %bb.4: ; %if2 ; SI-NEXT: s_mov_b32 s3, 0 +; SI-NEXT: s_movk_i32 s2, 0x1f8 +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SI-NEXT: s_movk_i32 s2, 0x1f0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v2 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc @@ -67,8 +70,11 @@ ; FLAT-NEXT: s_cbranch_execz .LBB0_5 ; FLAT-NEXT: ; %bb.4: ; %if2 ; FLAT-NEXT: s_mov_b32 s3, 0 +; FLAT-NEXT: s_movk_i32 s2, 0x1f8 +; FLAT-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; FLAT-NEXT: s_movk_i32 s2, 0x1f0 ; FLAT-NEXT: v_add_f32_e32 v0, 1.0, v2 -; FLAT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; FLAT-NEXT: v_cvt_i32_f32_e32 v0, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc @@ -105,9 +111,9 @@ br i1 %.0, label %if2, label %endif2 if2: - %.5 = getelementptr inbounds <4 x i32>, ptr addrspace(6) undef, i32 31, !amdgpu.uniform !0 - %.6 = load <4 x i32>, ptr addrspace(6) %.5, align 16, !invariant.load !0 - %.7 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %.test1, <4 x i32> %.6, i32 4, i32 0, i32 0) + %.5 = getelementptr inbounds ptr addrspace(8), ptr addrspace(6) null, i32 31, !amdgpu.uniform !0 + %.6 = load ptr addrspace(8), ptr addrspace(6) %.5, align 16, !invariant.load !0 + %.7 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %.test1, ptr addrspace(8) %.6, i32 4, i32 0, i32 0) %.8 = sitofp i32 %.7 to float br label %endif2 @@ -117,7 +123,7 @@ } -declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32 immarg) #2 +declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) #2 declare i1 @llvm.amdgcn.wqm.vote(i1) #3 declare void @llvm.amdgcn.kill(i1) #4 declare float @llvm.amdgcn.wqm.f32(float) #1 diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll --- a/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll +++ b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll @@ -17,9 +17,9 @@ ; CHECK-NEXT: s_endpgm entry: call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, i1 false, i1 false) - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> zeroinitializer, ptr addrspace(8) zeroinitializer, i32 0, i32 0, i32 0, i32 0) ret void } declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) -declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll --- a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll +++ b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll @@ -69,8 +69,8 @@ %tmp = insertelement <2 x i32> zeroinitializer, i32 %arg, i32 0 %tmp1 = bitcast <2 x i32> %tmp to i64 %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(4) - %tmp3 = load <4 x i32>, ptr addrspace(4) %tmp2, align 16 - %tmp4 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp3, i32 0, i32 0) #0 + %tmp3 = load ptr addrspace(8), ptr addrspace(4) %tmp2, align 16 + %tmp4 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %tmp3, i32 0, i32 0) #0 switch i32 %tmp4, label %bb [ i32 0, label %bb5 i32 1, label %bb6 @@ -87,4 +87,4 @@ } ; Function Attrs: nounwind readnone -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) #1 +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32 immarg) #1 diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -34,45 +34,45 @@ define amdgpu_ps void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: %lds = inttoptr i32 0 to ptr addrspace(3) - %tmp21 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0 - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0) - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0) - %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0) - %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 112, i32 0) - %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 116, i32 0) - %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 120, i32 0) - %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 128, i32 0) - %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 132, i32 0) - %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 140, i32 0) - %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 144, i32 0) - %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 160, i32 0) - %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 176, i32 0) - %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 180, i32 0) - %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 184, i32 0) - %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 192, i32 0) - %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 196, i32 0) - %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 200, i32 0) - %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 208, i32 0) - %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 212, i32 0) - %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 216, i32 0) - %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 224, i32 0) - %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 240, i32 0) - %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 244, i32 0) - %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 248, i32 0) - %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 256, i32 0) - %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 272, i32 0) - %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 276, i32 0) - %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 280, i32 0) - %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 288, i32 0) - %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 292, i32 0) - %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 296, i32 0) - %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 304, i32 0) - %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 308, i32 0) - %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 312, i32 0) - %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 368, i32 0) - %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 372, i32 0) - %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 376, i32 0) - %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 384, i32 0) + %tmp21 = load ptr addrspace(8), ptr addrspace(4) %arg, !tbaa !0 + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 96, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 100, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 104, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 112, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 116, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 120, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 128, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 132, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 140, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 144, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 160, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 176, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 180, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 184, i32 0) + %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 192, i32 0) + %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 196, i32 0) + %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 200, i32 0) + %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 208, i32 0) + %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 212, i32 0) + %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 216, i32 0) + %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 224, i32 0) + %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 240, i32 0) + %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 244, i32 0) + %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 248, i32 0) + %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 256, i32 0) + %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 272, i32 0) + %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 276, i32 0) + %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 280, i32 0) + %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 288, i32 0) + %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 292, i32 0) + %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 296, i32 0) + %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 304, i32 0) + %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 308, i32 0) + %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 312, i32 0) + %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 368, i32 0) + %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 372, i32 0) + %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 376, i32 0) + %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 384, i32 0) %tmp61 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0 %tmp63 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0 %tmp63.bc = bitcast <4 x i32> %tmp63 to <4 x i32> @@ -650,110 +650,110 @@ ; TOVGPR: ScratchSize: 0{{$}} define amdgpu_ps void @main1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: - %tmp21 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0 - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 0, i32 0) - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 4, i32 0) - %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 8, i32 0) - %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 12, i32 0) - %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 28, i32 0) - %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 48, i32 0) - %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 52, i32 0) - %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 56, i32 0) - %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 64, i32 0) - %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 68, i32 0) - %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 72, i32 0) - %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 76, i32 0) - %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 128, i32 0) - %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 132, i32 0) - %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 144, i32 0) - %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 148, i32 0) - %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 152, i32 0) - %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 160, i32 0) - %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 164, i32 0) - %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 168, i32 0) - %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 172, i32 0) - %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 176, i32 0) - %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 180, i32 0) - %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 184, i32 0) - %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 192, i32 0) - %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 196, i32 0) - %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 200, i32 0) - %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 208, i32 0) - %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 212, i32 0) - %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 216, i32 0) - %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 220, i32 0) - %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 236, i32 0) - %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 240, i32 0) - %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 244, i32 0) - %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 248, i32 0) - %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 252, i32 0) - %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 256, i32 0) - %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 260, i32 0) - %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 264, i32 0) - %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 268, i32 0) - %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 272, i32 0) - %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 276, i32 0) - %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 280, i32 0) - %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 284, i32 0) - %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 288, i32 0) - %tmp67 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 292, i32 0) - %tmp68 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 464, i32 0) - %tmp69 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 468, i32 0) - %tmp70 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 472, i32 0) - %tmp71 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 496, i32 0) - %tmp72 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 500, i32 0) - %tmp73 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 504, i32 0) - %tmp74 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 512, i32 0) - %tmp75 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 516, i32 0) - %tmp76 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 524, i32 0) - %tmp77 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 532, i32 0) - %tmp78 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 536, i32 0) - %tmp79 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 540, i32 0) - %tmp80 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 544, i32 0) - %tmp81 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 548, i32 0) - %tmp82 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 552, i32 0) - %tmp83 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 556, i32 0) - %tmp84 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 560, i32 0) - %tmp85 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 564, i32 0) - %tmp86 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 568, i32 0) - %tmp87 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 572, i32 0) - %tmp88 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 576, i32 0) - %tmp89 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 580, i32 0) - %tmp90 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 584, i32 0) - %tmp91 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 588, i32 0) - %tmp92 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 592, i32 0) - %tmp93 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 596, i32 0) - %tmp94 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 600, i32 0) - %tmp95 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 604, i32 0) - %tmp96 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 608, i32 0) - %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 612, i32 0) - %tmp98 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 616, i32 0) - %tmp99 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 624, i32 0) - %tmp100 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 628, i32 0) - %tmp101 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 632, i32 0) - %tmp102 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 636, i32 0) - %tmp103 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 640, i32 0) - %tmp104 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 644, i32 0) - %tmp105 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 648, i32 0) - %tmp106 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 652, i32 0) - %tmp107 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 656, i32 0) - %tmp108 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 660, i32 0) - %tmp109 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 664, i32 0) - %tmp110 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 668, i32 0) - %tmp111 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 672, i32 0) - %tmp112 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 676, i32 0) - %tmp113 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 680, i32 0) - %tmp114 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 684, i32 0) - %tmp115 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 688, i32 0) - %tmp116 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 692, i32 0) - %tmp117 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 696, i32 0) - %tmp118 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 700, i32 0) - %tmp119 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 704, i32 0) - %tmp120 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 708, i32 0) - %tmp121 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 712, i32 0) - %tmp122 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 716, i32 0) - %tmp123 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 864, i32 0) - %tmp124 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 868, i32 0) + %tmp21 = load ptr addrspace(8), ptr addrspace(4) %arg, !tbaa !0 + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 0, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 4, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 8, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 12, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 28, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 48, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 52, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 56, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 64, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 68, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 72, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 76, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 128, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 132, i32 0) + %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 144, i32 0) + %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 148, i32 0) + %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 152, i32 0) + %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 160, i32 0) + %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 164, i32 0) + %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 168, i32 0) + %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 172, i32 0) + %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 176, i32 0) + %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 180, i32 0) + %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 184, i32 0) + %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 192, i32 0) + %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 196, i32 0) + %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 200, i32 0) + %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 208, i32 0) + %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 212, i32 0) + %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 216, i32 0) + %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 220, i32 0) + %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 236, i32 0) + %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 240, i32 0) + %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 244, i32 0) + %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 248, i32 0) + %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 252, i32 0) + %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 256, i32 0) + %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 260, i32 0) + %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 264, i32 0) + %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 268, i32 0) + %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 272, i32 0) + %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 276, i32 0) + %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 280, i32 0) + %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 284, i32 0) + %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 288, i32 0) + %tmp67 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 292, i32 0) + %tmp68 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 464, i32 0) + %tmp69 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 468, i32 0) + %tmp70 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 472, i32 0) + %tmp71 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 496, i32 0) + %tmp72 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 500, i32 0) + %tmp73 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 504, i32 0) + %tmp74 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 512, i32 0) + %tmp75 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 516, i32 0) + %tmp76 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 524, i32 0) + %tmp77 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 532, i32 0) + %tmp78 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 536, i32 0) + %tmp79 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 540, i32 0) + %tmp80 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 544, i32 0) + %tmp81 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 548, i32 0) + %tmp82 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 552, i32 0) + %tmp83 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 556, i32 0) + %tmp84 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 560, i32 0) + %tmp85 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 564, i32 0) + %tmp86 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 568, i32 0) + %tmp87 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 572, i32 0) + %tmp88 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 576, i32 0) + %tmp89 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 580, i32 0) + %tmp90 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 584, i32 0) + %tmp91 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 588, i32 0) + %tmp92 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 592, i32 0) + %tmp93 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 596, i32 0) + %tmp94 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 600, i32 0) + %tmp95 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 604, i32 0) + %tmp96 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 608, i32 0) + %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 612, i32 0) + %tmp98 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 616, i32 0) + %tmp99 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 624, i32 0) + %tmp100 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 628, i32 0) + %tmp101 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 632, i32 0) + %tmp102 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 636, i32 0) + %tmp103 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 640, i32 0) + %tmp104 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 644, i32 0) + %tmp105 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 648, i32 0) + %tmp106 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 652, i32 0) + %tmp107 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 656, i32 0) + %tmp108 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 660, i32 0) + %tmp109 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 664, i32 0) + %tmp110 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 668, i32 0) + %tmp111 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 672, i32 0) + %tmp112 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 676, i32 0) + %tmp113 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 680, i32 0) + %tmp114 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 684, i32 0) + %tmp115 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 688, i32 0) + %tmp116 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 692, i32 0) + %tmp117 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 696, i32 0) + %tmp118 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 700, i32 0) + %tmp119 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 704, i32 0) + %tmp120 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 708, i32 0) + %tmp121 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 712, i32 0) + %tmp122 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 716, i32 0) + %tmp123 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 864, i32 0) + %tmp124 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp21, i32 868, i32 0) %tmp126 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0 %tmp128 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0 %tmp129 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 1 @@ -1685,7 +1685,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -9,73 +9,73 @@ define amdgpu_ps void @main() #0 { main_body: - %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 16, i32 0) - %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 32, i32 0) - %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 80, i32 0) - %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 84, i32 0) - %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 88, i32 0) - %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 96, i32 0) - %tmp6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 100, i32 0) - %tmp7 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 104, i32 0) - %tmp8 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 112, i32 0) - %tmp9 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 116, i32 0) - %tmp10 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 120, i32 0) - %tmp11 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 128, i32 0) - %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 132, i32 0) - %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 136, i32 0) - %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 144, i32 0) - %tmp15 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 148, i32 0) - %tmp16 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 152, i32 0) - %tmp17 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 160, i32 0) - %tmp18 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 164, i32 0) - %tmp19 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 168, i32 0) - %tmp20 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 176, i32 0) - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 180, i32 0) - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 184, i32 0) - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 192, i32 0) - %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 196, i32 0) - %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 200, i32 0) - %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 208, i32 0) - %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 212, i32 0) - %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 216, i32 0) - %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 224, i32 0) - %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 228, i32 0) - %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 232, i32 0) - %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 240, i32 0) - %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 244, i32 0) - %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 248, i32 0) - %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 256, i32 0) - %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 260, i32 0) - %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 264, i32 0) - %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 272, i32 0) - %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 276, i32 0) - %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 280, i32 0) - %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 288, i32 0) - %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 292, i32 0) - %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 296, i32 0) - %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 304, i32 0) - %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 308, i32 0) - %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 312, i32 0) - %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 320, i32 0) - %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 324, i32 0) - %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 328, i32 0) - %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 336, i32 0) - %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 340, i32 0) - %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 344, i32 0) - %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 352, i32 0) - %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 356, i32 0) - %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 360, i32 0) - %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 368, i32 0) - %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 372, i32 0) - %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 376, i32 0) - %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 384, i32 0) - %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 388, i32 0) - %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 392, i32 0) - %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 400, i32 0) - %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 404, i32 0) - %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 408, i32 0) - %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 416, i32 0) - %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 420, i32 0) + %tmp = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 16, i32 0) + %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 32, i32 0) + %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 80, i32 0) + %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 84, i32 0) + %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 88, i32 0) + %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 96, i32 0) + %tmp6 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 100, i32 0) + %tmp7 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 104, i32 0) + %tmp8 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 112, i32 0) + %tmp9 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 116, i32 0) + %tmp10 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 120, i32 0) + %tmp11 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 128, i32 0) + %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 132, i32 0) + %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 136, i32 0) + %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 144, i32 0) + %tmp15 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 148, i32 0) + %tmp16 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 152, i32 0) + %tmp17 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 160, i32 0) + %tmp18 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 164, i32 0) + %tmp19 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 168, i32 0) + %tmp20 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 176, i32 0) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 180, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 184, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 192, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 196, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 200, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 208, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 212, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 216, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 224, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 228, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 232, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 240, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 244, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 248, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 256, i32 0) + %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 260, i32 0) + %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 264, i32 0) + %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 272, i32 0) + %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 276, i32 0) + %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 280, i32 0) + %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 288, i32 0) + %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 292, i32 0) + %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 296, i32 0) + %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 304, i32 0) + %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 308, i32 0) + %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 312, i32 0) + %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 320, i32 0) + %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 324, i32 0) + %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 328, i32 0) + %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 336, i32 0) + %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 340, i32 0) + %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 344, i32 0) + %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 352, i32 0) + %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 356, i32 0) + %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 360, i32 0) + %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 368, i32 0) + %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 372, i32 0) + %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 376, i32 0) + %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 384, i32 0) + %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 388, i32 0) + %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 392, i32 0) + %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 400, i32 0) + %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 404, i32 0) + %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 408, i32 0) + %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 416, i32 0) + %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 420, i32 0) br label %LOOP LOOP: ; preds = %ENDIF2795, %main_body @@ -498,7 +498,7 @@ declare float @llvm.maxnum.f32(float, float) #1 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -303,7 +303,7 @@ %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 %vaddr.add = add i32 %vaddr, 32 - call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %vdata, ptr addrspace(8) undef, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3) %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4 @@ -314,7 +314,7 @@ declare void @llvm.amdgcn.s.barrier() #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 -declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #3 +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #3 attributes #0 = { nounwind } attributes #1 = { convergent nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll b/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll --- a/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll @@ -4,39 +4,39 @@ ; GCN-LABEL: {{^}}smrd_imm_dlc: ; GFX10: s_buffer_load_dword s0, s[0:3], 0x0 dlc ; encoding: [0x00,0x40,0x20,0xf4,0x00,0x00,0x00,0xfa] ; GFX11: s_buffer_load_b32 s0, s[0:3], 0x0 dlc ; encoding: [0x00,0x20,0x20,0xf4,0x00,0x00,0x00,0xf8] -define amdgpu_ps float @smrd_imm_dlc(<4 x i32> inreg %desc) #0 { +define amdgpu_ps float @smrd_imm_dlc(ptr addrspace(8) inreg %desc) #0 { main_body: - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 4) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 0, i32 4) ret float %r } ; GCN-LABEL: {{^}}smrd_sgpr_dlc: ; GFX10: s_buffer_load_dword s0, s[0:3], s4 dlc ; encoding: [0x00,0x40,0x20,0xf4,0x00,0x00,0x00,0x08] ; GFX11: s_buffer_load_b32 s0, s[0:3], s4 dlc ; encoding: [0x00,0x20,0x20,0xf4,0x00,0x00,0x00,0x08] -define amdgpu_ps float @smrd_sgpr_dlc(<4 x i32> inreg %desc, i32 inreg %offset) #0 { +define amdgpu_ps float @smrd_sgpr_dlc(ptr addrspace(8) inreg %desc, i32 inreg %offset) #0 { main_body: - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 4) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %offset, i32 4) ret float %r } ; GCN-LABEL: {{^}}smrd_imm_glc_dlc: ; GFX10: s_buffer_load_dword s0, s[0:3], 0x0 glc dlc ; encoding: [0x00,0x40,0x21,0xf4,0x00,0x00,0x00,0xfa] ; GFX11: s_buffer_load_b32 s0, s[0:3], 0x0 glc dlc ; encoding: [0x00,0x60,0x20,0xf4,0x00,0x00,0x00,0xf8] -define amdgpu_ps float @smrd_imm_glc_dlc(<4 x i32> inreg %desc) #0 { +define amdgpu_ps float @smrd_imm_glc_dlc(ptr addrspace(8) inreg %desc) #0 { main_body: - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 5) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 0, i32 5) ret float %r } ; GCN-LABEL: {{^}}smrd_sgpr_glc_dlc: ; GFX10: s_buffer_load_dword s0, s[0:3], s4 glc dlc ; encoding: [0x00,0x40,0x21,0xf4,0x00,0x00,0x00,0x08] ; GFX11: s_buffer_load_b32 s0, s[0:3], s4 glc dlc ; encoding: [0x00,0x60,0x20,0xf4,0x00,0x00,0x00,0x08] -define amdgpu_ps float @smrd_sgpr_glc_dlc(<4 x i32> inreg %desc, i32 inreg %offset) #0 { +define amdgpu_ps float @smrd_sgpr_glc_dlc(ptr addrspace(8) inreg %desc, i32 inreg %offset) #0 { main_body: - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 5) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %offset, i32 5) ret float %r } -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -121,13 +121,11 @@ ; GCN-DAG: s_mov_b32 s0, 0 ; SI-NEXT: nop 3 ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0 -define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 { +define amdgpu_ps float @smrd_hazard(ptr addrspace(8) inreg %desc) #0 { main_body: - %d0 = insertelement <4 x i32> undef, i32 0, i32 0 - %d1 = insertelement <4 x i32> %d0, i32 1, i32 1 - %d2 = insertelement <4 x i32> %d1, i32 2, i32 2 - %d3 = insertelement <4 x i32> %d2, i32 3, i32 3 - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %d3, i32 0, i32 0) + %desc2.int = bitcast <4 x i32> to i128 + %desc2 = inttoptr i128 %desc2.int to ptr addrspace(8) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc2, i32 0, i32 0) ret float %r } @@ -137,8 +135,8 @@ ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 define amdgpu_ps void @smrd_load_const0(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 16, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -152,10 +150,10 @@ ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc glc ; define amdgpu_ps void @smrd_load_const1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1020, i32 0) - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 1020, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %tmp22, i32 1020, i32 1) %s.buffer.float = bitcast i32 %s.buffer to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void @@ -174,10 +172,10 @@ ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 define amdgpu_ps void @smrd_load_const2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1024, i32 0) - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 1024, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %tmp22, i32 1024, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void @@ -194,10 +192,10 @@ ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc define amdgpu_ps void @smrd_load_const3(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048572, i32 0) - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 1048572, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %tmp22, i32 1048572, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void @@ -213,10 +211,10 @@ ; GCN: s_endpgm define amdgpu_ps void @smrd_load_const4(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048576, i32 0) - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 1048576, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %tmp22, i32 1048576, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void @@ -228,8 +226,8 @@ ; SICI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 define amdgpu_ps void @s_buffer_load_dwordx2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 128, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %tmp22, i32 128, i32 0) %s.buffer.0 = extractelement <2 x i32> %s.buffer, i32 0 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float %s.buffer.1 = extractelement <2 x i32> %s.buffer, i32 1 @@ -244,8 +242,8 @@ ; SICI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 define amdgpu_ps void @s_buffer_load_dwordx4(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %tmp22, i32 128, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %tmp22, i32 128, i32 0) %s.buffer.0 = extractelement <4 x i32> %s.buffer, i32 0 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float %s.buffer.1 = extractelement <4 x i32> %s.buffer, i32 1 @@ -264,8 +262,8 @@ ; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 define amdgpu_ps void @s_buffer_load_dwordx8(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 128, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8) %tmp22, i32 128, i32 0) %s.buffer.0 = extractelement <8 x i32> %s.buffer, i32 0 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float %s.buffer.1 = extractelement <8 x i32> %s.buffer, i32 2 @@ -284,8 +282,8 @@ ; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 define amdgpu_ps void @s_buffer_load_dwordx8_v8f32(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %tmp22, i32 128, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %tmp22, i32 128, i32 0) %s.buffer.0 = extractelement <8 x float> %s.buffer, i32 0 %s.buffer.1 = extractelement <8 x float> %s.buffer, i32 2 %s.buffer.2 = extractelement <8 x float> %s.buffer, i32 5 @@ -300,8 +298,8 @@ ; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 define amdgpu_ps void @s_buffer_load_dwordx16(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %tmp22, i32 128, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8) %tmp22, i32 128, i32 0) %s.buffer.0 = extractelement <16 x i32> %s.buffer, i32 0 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float %s.buffer.1 = extractelement <16 x i32> %s.buffer, i32 3 @@ -319,8 +317,8 @@ ; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 define amdgpu_ps void @s_buffer_load_dwordx16_v16f32(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 { main_body: - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %tmp22, i32 128, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %tmp22, i32 128, i32 0) %s.buffer.0 = extractelement <16 x float> %s.buffer, i32 0 %s.buffer.1 = extractelement <16 x float> %s.buffer, i32 3 %s.buffer.2 = extractelement <16 x float> %s.buffer, i32 12 @@ -331,27 +329,27 @@ ; GCN-LABEL: {{^}}smrd_sgpr_offset: ; GCN: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4 -define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { +define amdgpu_ps float @smrd_sgpr_offset(ptr addrspace(8) inreg %desc, i32 inreg %offset) #0 { main_body: - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %offset, i32 0) ret float %r } ; GCN-LABEL: {{^}}smrd_vgpr_offset: ; GCN: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ; -define amdgpu_ps float @smrd_vgpr_offset(<4 x i32> inreg %desc, i32 %offset) #0 { +define amdgpu_ps float @smrd_vgpr_offset(ptr addrspace(8) inreg %desc, i32 %offset) #0 { main_body: - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %offset, i32 0) ret float %r } ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm: ; GCN-NEXT: %bb. ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ; -define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 { +define amdgpu_ps float @smrd_vgpr_offset_imm(ptr addrspace(8) inreg %desc, i32 %offset) #0 { main_body: %off = add i32 %offset, 4092 - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %off, i32 0) ret float %r } @@ -360,10 +358,10 @@ ; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0 ; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ; ; VIGFX9_10-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ; -define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 { +define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(ptr addrspace(8) inreg %desc, i32 %offset) #0 { main_body: %off = add i32 %offset, 4096 - %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0) + %r = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %off, i32 0) ret float %r } @@ -374,14 +372,14 @@ ; GFX10-NEXT: s_clause ; VIGFX9_10-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4 ; VIGFX9_10-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c -define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 { +define amdgpu_ps void @smrd_imm_merged(ptr addrspace(8) inreg %desc) #0 { main_body: - %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0) - %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 8, i32 0) - %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 12, i32 0) - %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 16, i32 0) - %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 28, i32 0) - %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 32, i32 0) + %r1 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 4, i32 0) + %r2 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 8, i32 0) + %r3 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 12, i32 0) + %r4 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 16, i32 0) + %r5 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 28, i32 0) + %r6 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 32, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0 ret void @@ -406,9 +404,9 @@ ; ; Merging is still thwarted on GFX9 due to s_set_gpr_idx ; -define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 { +define amdgpu_ps float @smrd_imm_merge_m0(ptr addrspace(8) inreg %desc, i32 inreg %prim, float %u, float %v) #0 { main_body: - %idx1.f = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 0) + %idx1.f = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 0, i32 0) %idx1 = bitcast float %idx1.f to i32 %v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim) @@ -433,7 +431,7 @@ %v1 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2 %b = extractelement <3 x float> %v1, i32 %idx1 - %c = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0) + %c = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 4, i32 0) %res.tmp = fadd float %a, %b %res = fadd float %res.tmp, %c @@ -445,7 +443,7 @@ ; GFX10-NEXT: s_clause ; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 { +define amdgpu_ps void @smrd_vgpr_merged(ptr addrspace(8) inreg %desc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -453,12 +451,12 @@ %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a1, i32 0) - %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a2, i32 0) - %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a3, i32 0) - %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a4, i32 0) - %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a5, i32 0) - %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a6, i32 0) + %r1 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %a1, i32 0) + %r2 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %a2, i32 0) + %r3 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %a3, i32 0) + %r4 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %a4, i32 0) + %r5 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %a5, i32 0) + %r6 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %a6, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0 ret void @@ -483,15 +481,15 @@ br i1 %inner_br1, label %.inner_loop_body, label %ret_block .inner_loop_body: - %descriptor = load <4 x i32>, ptr addrspace(4) %0, align 16, !invariant.load !0 - %load1result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 0, i32 0) + %descriptor = load ptr addrspace(8), ptr addrspace(4) %0, align 16, !invariant.load !0 + %load1result = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %descriptor, i32 0, i32 0) store float %load1result, ptr addrspace(1) undef %inner_br2 = icmp uge i32 %1, 10 br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body .outer_loop_body: %offset = shl i32 %loopctr.2, 6 - %load2result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 %offset, i32 0) + %load2result = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %descriptor, i32 %offset, i32 0) %outer_br = fcmp ueq float %load2result, 0x0 br i1 %outer_br, label %.outer_loop_header, label %ret_block } @@ -505,10 +503,10 @@ ; GCN: s_endpgm define amdgpu_ps void @smrd_load_nonconst0(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 inreg %ncoff) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0) - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 %ncoff, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %tmp22, i32 %ncoff, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void @@ -523,10 +521,10 @@ ; GCN: s_endpgm define amdgpu_ps void @smrd_load_nonconst1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 %ncoff) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0) - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 %ncoff, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %tmp22, i32 %ncoff, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void @@ -541,10 +539,10 @@ ; GCN: s_endpgm define amdgpu_ps void @smrd_load_nonconst2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 %ncoff) #0 { main_body: - %tmp20 = load <4 x i32>, ptr addrspace(4) %arg - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0) - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %tmp20 = load ptr addrspace(8), ptr addrspace(4) %arg + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp20, i32 %ncoff, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8) %tmp22, i32 %ncoff, i32 0) %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1 %s.buffer.float = bitcast i32 %s.buffer.elt to float call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 @@ -558,9 +556,9 @@ ; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ; ; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ; ; GCN: ; return to shader part epilog -define amdgpu_ps <16 x float> @smrd_load_nonconst3(<4 x i32> inreg %rsrc, i32 %off) #0 { +define amdgpu_ps <16 x float> @smrd_load_nonconst3(ptr addrspace(8) inreg %rsrc, i32 %off) #0 { main_body: - %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off, i32 0) + %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8) %rsrc, i32 %off, i32 0) %bc = bitcast <16 x i32> %ld to <16 x float> ret <16 x float> %bc } @@ -576,10 +574,10 @@ ; VIGFX9_10-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ; ; VIGFX9_10-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ; ; GCN: ; return to shader part epilog -define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 { +define amdgpu_ps <16 x float> @smrd_load_nonconst4(ptr addrspace(8) inreg %rsrc, i32 %off) #0 { main_body: %off.2 = add i32 %off, 4088 - %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0) + %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8) %rsrc, i32 %off.2, i32 0) %bc = bitcast <16 x i32> %ld to <16 x float> ret <16 x float> %bc } @@ -596,10 +594,10 @@ ; VIGFX9_10-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ; ; VIGFX9_10-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ; ; GCN: ; return to shader part epilog -define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 { +define amdgpu_ps <16 x float> @smrd_load_nonconst5(ptr addrspace(8) inreg %rsrc, i32 %off) #0 { main_body: %off.2 = add i32 %off, 4100 - %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0) + %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8) %rsrc, i32 %off.2, i32 0) %bc = bitcast <16 x i32> %ld to <16 x float> ret <16 x float> %bc } @@ -611,8 +609,8 @@ ; GCN: s_endpgm define amdgpu_ps void @smrd_load_dwordx2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 inreg %ncoff) #0 { main_body: - %tmp22 = load <4 x i32>, ptr addrspace(4) %in - %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %tmp22 = load ptr addrspace(8), ptr addrspace(4) %in + %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %tmp22, i32 %ncoff, i32 0) %s.buffer.float = bitcast <2 x i32> %s.buffer to <2 x float> %r.1 = extractelement <2 x float> %s.buffer.float, i32 0 %r.2 = extractelement <2 x float> %s.buffer.float, i32 1 @@ -625,7 +623,7 @@ ; TODO: we should keep the loop counter in an SGPR ; ; GCN: s_buffer_load_dword -define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 { +define amdgpu_ps float @smrd_uniform_loop(ptr addrspace(8) inreg %desc, i32 %bound) #0 { main_body: br label %loop @@ -633,7 +631,7 @@ %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ] %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ] %offset = shl i32 %counter, 2 - %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) + %v = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %offset, i32 0) %sum.next = fadd float %sum, %v %counter.next = add i32 %counter, 1 %cc = icmp uge i32 %counter.next, %bound @@ -651,7 +649,7 @@ ; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD ; ; GCN: buffer_load_dword -define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 { +define amdgpu_ps float @smrd_uniform_loop2(ptr addrspace(8) inreg %desc, i32 %bound, i32 %bound.a) #0 { main_body: br label %loop @@ -659,7 +657,7 @@ %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ] %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ %sum.next.b, %loop.b ] %offset = shl i32 %counter, 2 - %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) + %v = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %desc, i32 %offset, i32 0) %sum.next = fadd float %sum, %v %counter.next = add i32 %counter, 1 %cc = icmp uge i32 %counter.next, %bound @@ -696,46 +694,46 @@ endif1: ; preds = %if1, %main_body %tmp13 = extractelement <3 x i32> %arg4, i32 0 - %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 %tmp13, i32 0) + %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 %tmp13, i32 0) ret float %tmp97 } ; GCN-LABEL: {{^}}s_buffer_load_f32: ; GCN: s_buffer_load_dword s0, s[0:3], s4 -define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { - %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) +define amdgpu_ps void @s_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 inreg %offset) { + %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) call void asm sideeffect "; use $0", "s"(float %sgpr) ret void } ; GCN-LABEL: {{^}}s_buffer_load_v2f32: ; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4 -define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { - %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0) +define amdgpu_ps void @s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 inreg %offset) { + %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr) ret void } ; GCN-LABEL: {{^}}s_buffer_load_v4f32: ; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4 -define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { - %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0) +define amdgpu_ps void @s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 inreg %offset) { + %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr) ret void } ; GCN-LABEL: {{^}}s_buffer_load_v8f32: ; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4 -define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { - %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0) +define amdgpu_ps void @s_buffer_load_v8f32(ptr addrspace(8) inreg %rsrc, i32 inreg %offset) { + %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr) ret void } ; GCN-LABEL: {{^}}s_buffer_load_v16f32: ; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4 -define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { - %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0) +define amdgpu_ps void @s_buffer_load_v16f32(ptr addrspace(8) inreg %rsrc, i32 inreg %offset) { + %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8) %rsrc, i32 %offset, i32 0) call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr) ret void } @@ -744,17 +742,17 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) #1 -declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) -declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32) -declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32) - -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) -declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) -declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) -declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32) -declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32) #1 +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8), i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32) +declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(ptr addrspace(8), i32, i32) +declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(ptr addrspace(8), i32, i32) + +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8), i32, i32) +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8), i32, i32) +declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(ptr addrspace(8), i32, i32) +declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(ptr addrspace(8), i32, i32) attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/split-smrd.ll b/llvm/test/CodeGen/AMDGPU/split-smrd.ll --- a/llvm/test/CodeGen/AMDGPU/split-smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/split-smrd.ll @@ -8,7 +8,7 @@ ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @split_smrd_add_worklist(ptr addrspace(4) inreg %arg) #0 { bb: - %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 96, i32 0) + %tmp = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) undef, i32 96, i32 0) %tmp1 = bitcast float %tmp to i32 br i1 undef, label %bb2, label %bb3 @@ -31,7 +31,7 @@ declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -47,97 +47,100 @@ ; CHECK-NEXT: KILL undef %132:sgpr_128 ; CHECK-NEXT: KILL %130.sub0, %130.sub1 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: undef %304.sub0:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: undef %302.sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: %304.sub1:sgpr_128 = COPY %304.sub0 + ; CHECK-NEXT: %304.sub2:sgpr_128 = COPY %304.sub0 + ; CHECK-NEXT: %304.sub3:sgpr_128 = COPY %304.sub0 + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from `ptr addrspace(8) undef`, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.44, align 1, addrspace 8) ; CHECK-NEXT: KILL undef %89:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], 64, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4) - ; CHECK-NEXT: %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc - ; CHECK-NEXT: %176.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %183.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %183.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %190.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %190.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %200.sub0:sreg_64 = S_ADD_U32 %50.sub0, undef %171:sreg_32, implicit-def $scc - ; CHECK-NEXT: %200.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %151.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %151.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %158.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %151, 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4) + ; CHECK-NEXT: %158.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %165.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %165.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %173:sreg_32, 31, implicit-def dead $scc + ; CHECK-NEXT: undef %178.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %173:sreg_32, implicit-def $scc + ; CHECK-NEXT: %178.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %185.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %185.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %192.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: %192.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %202.sub0:sreg_64 = S_ADD_U32 %50.sub0, undef %173:sreg_32, implicit-def $scc + ; CHECK-NEXT: %202.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 224, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %210.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %210.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %217.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %217.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %224.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %224.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %212.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %212.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %219.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: %219.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %226.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %226.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 576, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %241.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %241.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %253.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc - ; CHECK-NEXT: %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %243.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %243.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %255.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %255.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %263.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %173:sreg_32, implicit-def $scc + ; CHECK-NEXT: %263.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %275.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %275.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %288.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: %288.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %295.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %295.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %302, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %302, undef %314:sreg_32, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %302, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %302, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %163, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %304, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %304, undef %310:sreg_32, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %304, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %304, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from `ptr addrspace(8) undef`, align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %134:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %365:sgpr_128, undef %366:sreg_32, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %158, 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %165, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %178, 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %185, 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %192, 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.88, align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %360:sgpr_128, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %371:sgpr_128, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK-NEXT: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) - ; CHECK-NEXT: %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %323.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %323.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %331.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %331.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %339.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %202, 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) + ; CHECK-NEXT: %339.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %347.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %347.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.93, align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %390:sgpr_128, [[S_ADD_I32_6]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.98, align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %212, 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.105, align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %219, 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %226, 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.110, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.115, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.122, align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc @@ -148,86 +151,86 @@ ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR6]], -441, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %405.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %405.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.128, align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc - ; CHECK-NEXT: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef %419.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: %419.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %243, 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4) ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.133, align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc - ; CHECK-NEXT: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %441, 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: undef %435.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc + ; CHECK-NEXT: %435.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %435, 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %255, 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.138, align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %263, 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.142, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.148, align 1, addrspace 8) ; CHECK-NEXT: %71.sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: %71.sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) - ; CHECK-NEXT: [[COPY13]].sub1:sgpr_128 = COPY %302.sub1 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %275, 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) + ; CHECK-NEXT: [[COPY13]].sub1:sgpr_128 = COPY %304.sub0 ; CHECK-NEXT: [[COPY13]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.155, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.160, align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %288, 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %295, 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.168, align 1, addrspace 8) ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.44, align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef %453.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc - ; CHECK-NEXT: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %453, 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) + ; CHECK-NEXT: undef %447.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: %447.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %447, 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.176, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.181, align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %323, 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %331, 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %339, 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 ; CHECK-NEXT: [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %347, 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.203, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.209, align 1, addrspace 8) ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.214, align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef %468.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc - ; CHECK-NEXT: %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (invariant load (s64) from %ir.287, addrspace 4) + ; CHECK-NEXT: undef %462.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: %462.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.219, align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %462, 0, 0 :: (invariant load (s64) from %ir.287, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) - ; CHECK-NEXT: KILL %411.sub0, %411.sub1 - ; CHECK-NEXT: KILL undef %488:sreg_64 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %405, 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %482:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: KILL undef %482:sreg_64 ; CHECK-NEXT: KILL [[COPY15]].sub0_sub1, [[COPY15]].sub2_sub3 + ; CHECK-NEXT: KILL %405.sub0, %405.sub1 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %419, 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc - ; CHECK-NEXT: undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc - ; CHECK-NEXT: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %485, 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: undef %479.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc + ; CHECK-NEXT: %479.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %479, 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.254, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.262, align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc @@ -243,22 +246,22 @@ ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %514, 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4) - ; CHECK-NEXT: undef %522.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %522.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %522, 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4) - ; CHECK-NEXT: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: undef %508.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: %508.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %508, 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4) + ; CHECK-NEXT: undef %516.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: %516.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %516, 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4) + ; CHECK-NEXT: undef %524.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %524.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %524, 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.317, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.322, align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.327, align 1, addrspace 8) + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -370,13 +373,13 @@ ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %564:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %558:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec - ; CHECK-NEXT: undef %693.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %693, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: undef %687.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %687, undef %572:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 @@ -424,255 +427,259 @@ %rootDesc58.ii1.i = load i32, ptr addrspace(4) %.i184.i, align 4 %40 = and i32 %rootDesc58.ii1.i, 65535 %41 = insertelement <4 x i32> , i32 %rootDesc58.ii0.i, i32 0 - %42 = insertelement <4 x i32> %41, i32 %40, i32 1 + %.42.vec = insertelement <4 x i32> %41, i32 %40, i32 1 + %.42.int = bitcast <4 x i32> %.42.vec to i128 + %42 = inttoptr i128 %.42.int to ptr addrspace(8) %43 = and i32 undef, 65535 - %44 = insertelement <4 x i32> undef, i32 %43, i32 1 - %45 = load <4 x i32>, ptr addrspace(4) undef, align 16 - %46 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %45, i32 0, i32 0, i32 0, i32 0) + %.44.vec = insertelement <4 x i32> undef, i32 %43, i32 1 + %.44.int = bitcast <4 x i32> %.44.vec to i128 + %44 = inttoptr i128 %.44.int to ptr addrspace(8) + %45 = load ptr addrspace(8), ptr addrspace(4) undef, align 16 + %46 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %45, i32 0, i32 0, i32 0, i32 0) %47 = add i32 %46, -1 %48 = shl i32 %0, 4 %49 = call i32 @llvm.amdgcn.readfirstlane(i32 %48) %50 = sext i32 %49 to i64 - %51 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %51 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %52 = add i32 %51, -2 %53 = or i32 %52, %47 %54 = shl i32 %1, 4 %55 = call i32 @llvm.amdgcn.readfirstlane(i32 %54) %56 = sext i32 %55 to i64 - %57 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %57 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %58 = add i32 %57, -3 %59 = or i32 %53, %58 %60 = shl i32 %2, 4 %61 = call i32 @llvm.amdgcn.readfirstlane(i32 %60) %62 = sext i32 %61 to i64 - %63 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %63 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %64 = add i32 %63, -4 %65 = or i32 %59, %64 - %66 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %66 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %67 = add i32 %66, -27 %68 = or i32 %65, %67 - %69 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 0, i32 0, i32 0) + %69 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0) %70 = add i32 %69, -28 %71 = or i32 %68, %70 %72 = call i32 @llvm.amdgcn.readfirstlane(i32 %0) %73 = getelementptr i8, ptr addrspace(4) %35, i64 16 - %74 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0) + %74 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 0, i32 0) %75 = add i32 %74, -29 %76 = or i32 %71, %75 %77 = call i32 @llvm.amdgcn.readfirstlane(i32 %1) %78 = shl i32 %77, 4 %79 = sext i32 %78 to i64 - %80 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0) + %80 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 0, i32 0) %81 = add i32 %80, -30 %82 = or i32 %76, %81 %83 = call i32 @llvm.amdgcn.readfirstlane(i32 %2) %84 = shl i32 %83, 4 %85 = sext i32 %84 to i64 %86 = getelementptr i8, ptr addrspace(4) %73, i64 %85 - %87 = load <4 x i32>, ptr addrspace(4) %86, align 16 - %88 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %87, i32 0, i32 0) + %87 = load ptr addrspace(8), ptr addrspace(4) %86, align 16 + %88 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %87, i32 0, i32 0) %89 = add i32 %88, -31 %90 = or i32 %82, %89 %91 = getelementptr i8, ptr addrspace(4) %35, i64 64 %92 = getelementptr i8, ptr addrspace(4) %91, i64 %50 - %93 = load <4 x i32>, ptr addrspace(4) %92, align 16 - %94 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %93, i32 0, i32 0, i32 0, i32 0) + %93 = load ptr addrspace(8), ptr addrspace(4) %92, align 16 + %94 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %93, i32 0, i32 0, i32 0, i32 0) %95 = add i32 %94, -32 %96 = or i32 %90, %95 %97 = getelementptr i8, ptr addrspace(4) %91, i64 %56 - %98 = load <4 x i32>, ptr addrspace(4) %97, align 16 - %99 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %98, i32 0, i32 0, i32 0, i32 0) + %98 = load ptr addrspace(8), ptr addrspace(4) %97, align 16 + %99 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %98, i32 0, i32 0, i32 0, i32 0) %100 = add i32 %99, -33 %101 = or i32 %96, %100 %102 = getelementptr i8, ptr addrspace(4) %91, i64 %62 - %103 = load <4 x i32>, ptr addrspace(4) %102, align 16 - %104 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %103, i32 0, i32 0, i32 0, i32 0) + %103 = load ptr addrspace(8), ptr addrspace(4) %102, align 16 + %104 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %103, i32 0, i32 0, i32 0, i32 0) %105 = add i32 %104, -34 %106 = or i32 %101, %105 %107 = call i32 @llvm.amdgcn.readfirstlane(i32 undef) %108 = sext i32 %107 to i64 %109 = getelementptr i8, ptr addrspace(4) %91, i64 %108 - %110 = load <4 x i32>, ptr addrspace(4) %109, align 16 - %111 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %110, i32 0, i32 0, i32 0, i32 0) + %110 = load ptr addrspace(8), ptr addrspace(4) %109, align 16 + %111 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %110, i32 0, i32 0, i32 0, i32 0) %112 = add i32 %111, -36 %113 = or i32 %106, %112 %114 = getelementptr i8, ptr addrspace(4) %32, i64 %50 - %115 = load <4 x i32>, ptr addrspace(4) %114, align 16 - %116 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %115, i32 0, i32 0, i32 0, i32 0) + %115 = load ptr addrspace(8), ptr addrspace(4) %114, align 16 + %116 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %115, i32 0, i32 0, i32 0, i32 0) %117 = add i32 %116, -37 %118 = or i32 %113, %117 %119 = getelementptr i8, ptr addrspace(4) %32, i64 %56 - %120 = load <4 x i32>, ptr addrspace(4) %119, align 16 - %121 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %120, i32 0, i32 0, i32 0, i32 0) + %120 = load ptr addrspace(8), ptr addrspace(4) %119, align 16 + %121 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %120, i32 0, i32 0, i32 0, i32 0) %122 = add i32 %121, -38 %123 = or i32 %118, %122 - %124 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %124 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %125 = add i32 %124, -39 %126 = or i32 %123, %125 %127 = call i32 @llvm.amdgcn.readfirstlane(i32 undef) %128 = sext i32 %127 to i64 %129 = getelementptr i8, ptr addrspace(4) %32, i64 %128 - %130 = load <4 x i32>, ptr addrspace(4) %129, align 16 - %131 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %130, i32 0, i32 0, i32 0, i32 0) + %130 = load ptr addrspace(8), ptr addrspace(4) %129, align 16 + %131 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %130, i32 0, i32 0, i32 0, i32 0) %132 = add i32 %131, -50 %133 = or i32 %126, %132 %134 = getelementptr i8, ptr addrspace(4) %32, i64 224 %135 = getelementptr i8, ptr addrspace(4) %134, i64 %50 - %136 = load <4 x i32>, ptr addrspace(4) %135, align 16 - %137 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %136, i32 0, i32 0, i32 0, i32 0) + %136 = load ptr addrspace(8), ptr addrspace(4) %135, align 16 + %137 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %136, i32 0, i32 0, i32 0, i32 0) %138 = add i32 %137, -51 %139 = or i32 %133, %138 %140 = getelementptr i8, ptr addrspace(4) %134, i64 %56 - %141 = load <4 x i32>, ptr addrspace(4) %140, align 16 - %142 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %141, i32 0, i32 0, i32 0, i32 0) + %141 = load ptr addrspace(8), ptr addrspace(4) %140, align 16 + %142 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %141, i32 0, i32 0, i32 0, i32 0) %143 = add i32 %142, -52 %144 = or i32 %139, %143 %145 = getelementptr i8, ptr addrspace(4) %134, i64 %62 - %146 = load <4 x i32>, ptr addrspace(4) %145, align 16 - %147 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %146, i32 0, i32 0, i32 0, i32 0) + %146 = load ptr addrspace(8), ptr addrspace(4) %145, align 16 + %147 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %146, i32 0, i32 0, i32 0, i32 0) %148 = add i32 %147, -53 %149 = or i32 %144, %148 %150 = sext i32 undef to i64 %151 = getelementptr i8, ptr addrspace(4) %134, i64 %150 - %152 = load <4 x i32>, ptr addrspace(4) %151, align 16 - %153 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %152, i32 0, i32 0, i32 0, i32 0) + %152 = load ptr addrspace(8), ptr addrspace(4) %151, align 16 + %153 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %152, i32 0, i32 0, i32 0, i32 0) %154 = add i32 %153, -72 %155 = or i32 %149, %154 %156 = getelementptr i8, ptr addrspace(4) %32, i64 576 %157 = getelementptr i8, ptr addrspace(4) %156, i64 %50 - %158 = load <4 x i32>, ptr addrspace(4) %157, align 16 - %159 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %158, i32 0, i32 0, i32 0, i32 0) + %158 = load ptr addrspace(8), ptr addrspace(4) %157, align 16 + %159 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %158, i32 0, i32 0, i32 0, i32 0) %160 = add i32 %159, -73 %161 = or i32 %155, %160 - %162 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %162 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %163 = add i32 %162, -74 %164 = or i32 %161, %163 %165 = getelementptr i8, ptr addrspace(4) %156, i64 %62 - %166 = load <4 x i32>, ptr addrspace(4) %165, align 16 - %167 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %166, i32 0, i32 0, i32 0, i32 0) + %166 = load ptr addrspace(8), ptr addrspace(4) %165, align 16 + %167 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %166, i32 0, i32 0, i32 0, i32 0) %168 = add i32 %167, -75 %169 = or i32 %164, %168 %170 = getelementptr i8, ptr addrspace(4) %156, i64 %108 - %171 = load <4 x i32>, ptr addrspace(4) %170, align 16 - %172 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %171, i32 0, i32 0, i32 0, i32 0) + %171 = load ptr addrspace(8), ptr addrspace(4) %170, align 16 + %172 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %171, i32 0, i32 0, i32 0, i32 0) %173 = add i32 %172, -77 %174 = or i32 %169, %173 - %175 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %175 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %176 = add i32 %175, -93 %177 = or i32 %174, %176 %178 = inttoptr i64 %29 to ptr addrspace(4) %179 = getelementptr i8, ptr addrspace(4) %178, i64 %50 - %180 = load <4 x i32>, ptr addrspace(4) %179, align 16 - %181 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %180, i32 0, i32 0, i32 0, i32 0) + %180 = load ptr addrspace(8), ptr addrspace(4) %179, align 16 + %181 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %180, i32 0, i32 0, i32 0, i32 0) %182 = add i32 %181, -94 %183 = or i32 %177, %182 - %184 = load <4 x i32>, ptr addrspace(4) undef, align 16 - %185 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %184, i32 0, i32 0, i32 0) + %184 = load ptr addrspace(8), ptr addrspace(4) undef, align 16 + %185 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %184, i32 0, i32 0, i32 0) %186 = add i32 %185, -95 %187 = or i32 %183, %186 %188 = getelementptr i8, ptr addrspace(4) %27, i64 %79 - %189 = load <4 x i32>, ptr addrspace(4) %188, align 16 - %190 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %189, i32 0, i32 0, i32 0) + %189 = load ptr addrspace(8), ptr addrspace(4) %188, align 16 + %190 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %189, i32 0, i32 0, i32 0) %191 = add i32 %190, -96 %192 = or i32 %187, %191 %193 = getelementptr i8, ptr addrspace(4) %27, i64 %85 - %194 = load <4 x i32>, ptr addrspace(4) %193, align 16 - %195 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %194, i32 0, i32 0, i32 0) + %194 = load ptr addrspace(8), ptr addrspace(4) %193, align 16 + %195 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %194, i32 0, i32 0, i32 0) %196 = add i32 %195, -97 %197 = or i32 %192, %196 %198 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %0, i32 0 %199 = ptrtoint ptr addrspace(6) %198 to i32 - %200 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %199, i32 0) + %200 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %44, i32 %199, i32 0) %201 = add i32 %200, -98 %202 = or i32 %197, %201 - %203 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 undef, i32 0) + %203 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %44, i32 undef, i32 0) %204 = add i32 %203, -114 %205 = or i32 %202, %204 %206 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %2, i32 0 %207 = ptrtoint ptr addrspace(6) %206 to i32 - %208 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %207, i32 0) + %208 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %44, i32 %207, i32 0) %209 = add i32 %208, -130 %210 = or i32 %205, %209 %211 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 undef, i32 0 %212 = ptrtoint ptr addrspace(6) %211 to i32 - %213 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %212, i32 0) + %213 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %44, i32 %212, i32 0) %214 = add i32 %213, -178 %215 = or i32 %210, %214 %216 = inttoptr i64 %24 to ptr addrspace(4) %217 = getelementptr i8, ptr addrspace(4) %216, i64 %50 - %218 = load <4 x i32>, ptr addrspace(4) %217, align 16 - %219 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %218, i32 0, i32 0, i32 0, i32 0) + %218 = load ptr addrspace(8), ptr addrspace(4) %217, align 16 + %219 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %218, i32 0, i32 0, i32 0, i32 0) %220 = add i32 %219, -194 %221 = or i32 %215, %220 %222 = inttoptr i64 %22 to ptr addrspace(4) %223 = getelementptr i8, ptr addrspace(4) %222, i64 %50 - %224 = load <4 x i32>, ptr addrspace(4) %223, align 16 - %225 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %224, i32 0, i32 0, i32 0, i32 0) + %224 = load ptr addrspace(8), ptr addrspace(4) %223, align 16 + %225 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %224, i32 0, i32 0, i32 0, i32 0) %226 = add i32 %225, -195 %227 = or i32 %221, %226 %228 = getelementptr i8, ptr addrspace(4) %222, i64 %56 - %229 = load <4 x i32>, ptr addrspace(4) %228, align 16 - %230 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %229, i32 0, i32 0, i32 0, i32 0) + %229 = load ptr addrspace(8), ptr addrspace(4) %228, align 16 + %230 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %229, i32 0, i32 0, i32 0, i32 0) %231 = add i32 %230, -196 %232 = or i32 %227, %231 %233 = getelementptr i8, ptr addrspace(4) %222, i64 %62 - %234 = load <4 x i32>, ptr addrspace(4) %233, align 16 - %235 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %234, i32 0, i32 0, i32 0, i32 0) + %234 = load ptr addrspace(8), ptr addrspace(4) %233, align 16 + %235 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %234, i32 0, i32 0, i32 0, i32 0) %236 = add i32 %235, -197 %237 = or i32 %232, %236 - %238 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %238 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %239 = add i32 %238, -216 %240 = or i32 %237, %239 %241 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %0, i32 0 %242 = ptrtoint ptr addrspace(6) %241 to i32 - %243 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %242, i32 0) + %243 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 %242, i32 0) %244 = add i32 %243, -217 %245 = or i32 %240, %244 - %246 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %246 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %247 = add i32 %246, -233 %248 = or i32 %245, %247 %249 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %2, i32 0 %250 = ptrtoint ptr addrspace(6) %249 to i32 - %251 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %250, i32 0) + %251 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 %250, i32 0) %252 = add i32 %251, -249 %253 = or i32 %248, %252 %254 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 undef, i32 0 %255 = ptrtoint ptr addrspace(6) %254 to i32 - %256 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %255, i32 0) + %256 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 %255, i32 0) %257 = add i32 %256, -297 %258 = or i32 %253, %257 - %259 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %259 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %260 = add i32 %259, -313 %261 = or i32 %258, %260 - %262 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %262 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %263 = add i32 %262, -329 %264 = or i32 %261, %263 - %265 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %265 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %266 = add i32 %265, -345 %267 = or i32 %264, %266 %268 = getelementptr <{ [4 x i32], [9 x %llpc.array.element.5] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %4, i32 0 %269 = ptrtoint ptr addrspace(6) %268 to i32 - %270 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %269, i32 0) + %270 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 %269, i32 0) %271 = add i32 %270, -441 %272 = or i32 %267, %271 %273 = getelementptr i8, ptr addrspace(4) %20, i64 160 - %274 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %274 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %275 = add i32 %274, -457 %276 = or i32 %272, %275 - %277 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %277 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %278 = add i32 %277, -458 %279 = or i32 %276, %278 %280 = getelementptr i8, ptr addrspace(4) %273, i64 %62 - %281 = load <4 x i32>, ptr addrspace(4) %280, align 16 - %282 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %281, i32 0, i32 0, i32 0, i32 0) + %281 = load ptr addrspace(8), ptr addrspace(4) %280, align 16 + %282 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %281, i32 0, i32 0, i32 0, i32 0) %283 = add i32 %282, -459 %284 = or i32 %279, %283 %285 = shl i32 %5, 4 %286 = call i32 @llvm.amdgcn.readfirstlane(i32 %285) %287 = sext i32 %286 to i64 %288 = getelementptr i8, ptr addrspace(4) %273, i64 %287 - %289 = load <4 x i32>, ptr addrspace(4) %288, align 16 - %290 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %289, i32 0, i32 0, i32 0, i32 0) + %289 = load ptr addrspace(8), ptr addrspace(4) %288, align 16 + %290 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %289, i32 0, i32 0, i32 0, i32 0) %291 = add i32 %290, -466 %292 = or i32 %284, %291 %293 = getelementptr i8, ptr addrspace(4) %38, i64 168 @@ -682,8 +689,10 @@ %.ii0.i = load i32, ptr addrspace(4) %296, align 8 %297 = and i32 undef, 65535 %298 = insertelement <4 x i32> , i32 %.ii0.i, i32 0 - %299 = insertelement <4 x i32> %298, i32 %297, i32 1 - %300 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %299, i32 0, i32 0) + %.299.vec = insertelement <4 x i32> %298, i32 %297, i32 1 + %.299.int = bitcast <4 x i32> %.299.vec to i128 + %299 = inttoptr i128 %.299.int to ptr addrspace(8) + %300 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %299, i32 0, i32 0) %301 = add i32 %300, -467 %302 = or i32 %292, %301 %303 = shl i32 %77, 3 @@ -694,8 +703,10 @@ %.ii192.i = load i32, ptr addrspace(4) %.i191.i, align 4 %306 = and i32 %.ii192.i, 65535 %307 = insertelement <4 x i32> , i32 %.ii090.i, i32 0 - %308 = insertelement <4 x i32> %307, i32 %306, i32 1 - %309 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %308, i32 0, i32 0) + %.308.vec = insertelement <4 x i32> %307, i32 %306, i32 1 + %.308.int = bitcast <4 x i32> %.308.vec to i128 + %308 = inttoptr i128 %.308.int to ptr addrspace(8) + %309 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %308, i32 0, i32 0) %310 = add i32 %309, -468 %311 = or i32 %302, %310 %312 = shl i32 %83, 3 @@ -706,8 +717,10 @@ %.ii198.i = load i32, ptr addrspace(4) %.i197.i, align 4 %315 = and i32 %.ii198.i, 65535 %316 = insertelement <4 x i32> , i32 %.ii096.i, i32 0 - %317 = insertelement <4 x i32> %316, i32 %315, i32 1 - %318 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %317, i32 0, i32 0) + %.317.vec = insertelement <4 x i32> %316, i32 %315, i32 1 + %.317.int = bitcast <4 x i32> %.317.vec to i128 + %317 = inttoptr i128 %.317.int to ptr addrspace(8) + %318 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %317, i32 0, i32 0) %319 = add i32 %318, -469 %320 = or i32 %311, %319 %321 = call i32 @llvm.amdgcn.readfirstlane(i32 %3) @@ -718,57 +731,59 @@ %.ii1104.i = load i32, ptr addrspace(4) undef, align 4 %325 = and i32 %.ii1104.i, 65535 %326 = insertelement <4 x i32> , i32 %.ii0102.i, i32 0 - %327 = insertelement <4 x i32> %326, i32 %325, i32 1 - %328 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %327, i32 0, i32 0) + %.327.vec = insertelement <4 x i32> %326, i32 %325, i32 1 + %.327.int = bitcast <4 x i32> %.327.vec to i128 + %327 = inttoptr i128 %.327.int to ptr addrspace(8) + %328 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %327, i32 0, i32 0) %329 = add i32 %328, -473 %330 = or i32 %320, %329 - %331 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0) + %331 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 0, i32 0) %332 = add i32 %331, -474 %333 = or i32 %330, %332 - %334 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %334 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %335 = add i32 %334, -475 %336 = or i32 %333, %335 - %337 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %337 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %338 = add i32 %337, -491 %339 = or i32 %336, %338 - %340 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %340 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %341 = add i32 %340, -507 %342 = or i32 %339, %341 - %343 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0) + %343 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) undef, i32 undef, i32 0) %344 = add i32 %343, -539 %345 = or i32 %342, %344 %346 = getelementptr i8, ptr addrspace(4) %17, i64 96 %347 = getelementptr i8, ptr addrspace(4) %346, i64 %50 - %348 = load <4 x i32>, ptr addrspace(4) %347, align 16 - %349 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %348, i32 0, i32 0, i32 0, i32 0) + %348 = load ptr addrspace(8), ptr addrspace(4) %347, align 16 + %349 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %348, i32 0, i32 0, i32 0, i32 0) %350 = add i32 %349, -555 %351 = or i32 %345, %350 %352 = getelementptr i8, ptr addrspace(4) %346, i64 %56 - %353 = load <4 x i32>, ptr addrspace(4) %352, align 16 - %354 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %353, i32 0, i32 0, i32 0, i32 0) + %353 = load ptr addrspace(8), ptr addrspace(4) %352, align 16 + %354 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %353, i32 0, i32 0, i32 0, i32 0) %355 = add i32 %354, -556 %356 = or i32 %351, %355 %357 = getelementptr i8, ptr addrspace(4) %346, i64 %62 - %358 = load <4 x i32>, ptr addrspace(4) %357, align 16 - %359 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %358, i32 0, i32 0, i32 0, i32 0) + %358 = load ptr addrspace(8), ptr addrspace(4) %357, align 16 + %359 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) %358, i32 0, i32 0, i32 0, i32 0) %360 = add i32 %359, -557 %361 = or i32 %356, %360 - %362 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %362 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %363 = add i32 %362, -574 %364 = or i32 %361, %363 - %365 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %365 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %366 = add i32 %365, -575 %367 = or i32 %364, %366 - %368 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %368 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %369 = add i32 %368, -576 %370 = or i32 %367, %369 - %371 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %371 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %372 = add i32 %371, -577 %373 = or i32 %370, %372 - %374 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0) + %374 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8) undef, i32 0, i32 0, i32 0, i32 0) %375 = add i32 %374, -593 %376 = or i32 %373, %375 - %377 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %42, i32 0, i32 0) + %377 = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %42, i32 0, i32 0) %378 = add i32 %377, -594 %379 = or i32 %376, %378 %.not.i = icmp eq i32 %379, 0 @@ -782,7 +797,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) -declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32 immarg) -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) +declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(ptr addrspace(8), i32, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8), i32, i32 immarg) declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll @@ -6,9 +6,9 @@ ; ; CHECK-LABEL: {{^}}main: ; CHECK: s_cbranch_vccnz -define amdgpu_ps float @main(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @main(ptr addrspace(8) inreg %rsrc) { main_body: - %v = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 1) + %v = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 1) %cc = fcmp une float %v, 1.000000e+00 br i1 %cc, label %if, label %else @@ -22,6 +22,6 @@ ret float %r } -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -69,18 +69,18 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_vccnz .LBB0_1 -; GFX11-NEXT: ; %bb.4: ; %DummyReturnBlock -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; %bb.4: ; %DummyReturnBlock +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: br label %bb0 bb0: - %desc = load <4 x i32>, ptr %arg, align 8 - tail call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> %desc, i32 undef, i32 0, i32 0) + %desc = load ptr addrspace(8), ptr %arg, align 8 + tail call void @llvm.amdgcn.raw.buffer.store.f32(float undef, ptr addrspace(8) %desc, i32 undef, i32 0, i32 0) br label %bb0 } -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #0 attributes #0 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -29,14 +29,15 @@ define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: - %tmp11 = load <4 x i32>, ptr addrspace(4) %arg1, align 16, !tbaa !0 - %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 0, i32 0) - %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 16, i32 0) - %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 32, i32 0) + %tmp11 = load ptr addrspace(8), ptr addrspace(4) %arg1, align 16, !tbaa !0 + %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp11, i32 0, i32 0) + %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp11, i32 16, i32 0) + %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %tmp11, i32 32, i32 0) %tmp16 = load <4 x i32>, ptr addrspace(4) %arg4, align 16, !tbaa !0 %tmp17 = add i32 %arg5, %arg7 - %tmp16.cast = bitcast <4 x i32> %tmp16 to <4 x i32> - %tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i32 0, i32 0) + %tmp16.cast.int = bitcast <4 x i32> %tmp16 to i128 + %tmp16.cast = inttoptr i128 %tmp16.cast.int to ptr addrspace(8) + %tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp16.cast, i32 %tmp17, i32 0, i32 0, i32 0) %tmp19 = extractelement <4 x float> %tmp18, i32 0 %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 @@ -486,8 +487,8 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #1 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll --- a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll @@ -7,9 +7,9 @@ %1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 0, i32 0, i32 15, i32 15, i1 false) %2 = icmp ne i32 %1, %0 %spec.select.3 = select i1 %2, i32 0, i32 1 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %spec.select.3, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %spec.select.3, ptr addrspace(8) zeroinitializer, i32 0, i32 0, i32 0) ret void } declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -16,8 +16,9 @@ define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, ptr addrspace(4) inreg %constptr) #0 { main_body: %tmp10 = load <16 x i8>, ptr addrspace(4) %arg3, !tbaa !0 - %tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32> - %tmp11 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i32 0, i32 0) + %tmp10.cast.int = bitcast <16 x i8> %tmp10 to i128 + %tmp10.cast = inttoptr i128 %tmp10.cast.int to ptr addrspace(8) + %tmp11 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp10.cast, i32 %arg6, i32 0, i32 0, i32 0) %tmp12 = extractelement <4 x float> %tmp11, i32 0 %tmp13 = extractelement <4 x float> %tmp11, i32 1 call void @llvm.amdgcn.s.barrier() #1 @@ -25,8 +26,9 @@ %tmp15 = load float, ptr addrspace(4) %constptr, align 4 %tmp16 = getelementptr <16 x i8>, ptr addrspace(4) %arg3, i32 1 %tmp17 = load <16 x i8>, ptr addrspace(4) %tmp16, !tbaa !0 - %tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32> - %tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i32 0, i32 0) + %tmp17.cast.int = bitcast <16 x i8> %tmp17 to i128 + %tmp17.cast = inttoptr i128 %tmp17.cast.int to ptr addrspace(8) + %tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp17.cast, i32 %arg6, i32 0, i32 0, i32 0) %tmp19 = extractelement <4 x float> %tmp18, i32 0 %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 @@ -49,8 +51,9 @@ main_body: %tmp11 = load <16 x i8>, ptr addrspace(4) %arg4, align 16, !tbaa !0 %tmp12 = add i32 %arg5, %arg7 - %tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32> - %tmp13 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i32 0, i32 0) + %tmp11.cast.int = bitcast <16 x i8> %tmp11 to i128 + %tmp11.cast = inttoptr i128 %tmp11.cast.int to ptr addrspace(8) + %tmp13 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp11.cast, i32 %tmp12, i32 0, i32 0, i32 0) %tmp14 = extractelement <4 x float> %tmp13, i32 0 %tmp15 = extractelement <4 x float> %tmp13, i32 1 %tmp16 = extractelement <4 x float> %tmp13, i32 2 @@ -58,8 +61,9 @@ %tmp18 = getelementptr [16 x <16 x i8>], ptr addrspace(4) %arg4, i64 0, i64 1 %tmp19 = load <16 x i8>, ptr addrspace(4) %tmp18, align 16, !tbaa !0 %tmp20 = add i32 %arg5, %arg7 - %tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32> - %tmp21 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i32 0, i32 0) + %tmp19.cast.int = bitcast <16 x i8> %tmp19 to i128 + %tmp19.cast = inttoptr i128 %tmp19.cast.int to ptr addrspace(8) + %tmp21 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %tmp19.cast, i32 %tmp20, i32 0, i32 0, i32 0) %tmp22 = extractelement <4 x float> %tmp21, i32 0 %tmp23 = extractelement <4 x float> %tmp21, i32 1 %tmp24 = extractelement <4 x float> %tmp21, i32 2 @@ -70,7 +74,7 @@ } declare void @llvm.amdgcn.s.barrier() #1 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #2 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1959,7 +1959,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) %out.1 = fadd float %src, %out.0 @@ -2044,7 +2044,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) %out.1 = fadd float %src, %out.0 @@ -2126,8 +2126,8 @@ ; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX1064-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = bitcast float %out to i32 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) @@ -2945,7 +2945,7 @@ declare i32 @llvm.amdgcn.wqm.i32(i32) declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -96,7 +96,7 @@ %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0) ret <4 x float> %tex } @@ -181,7 +181,7 @@ main_body: %c.1 = mul i32 %c, %d - call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0) %c.1.bc = bitcast i32 %c.1 to float %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -223,8 +223,8 @@ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) ret float %out.0 @@ -262,8 +262,8 @@ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = bitcast float %out to i32 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) @@ -303,8 +303,8 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) ret float %out.0 @@ -340,8 +340,8 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src1.0 = bitcast float %src1 to i32 %out = add i32 %src0.0, %src1.0 @@ -405,7 +405,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) %out.1 = fadd float %src, %out.0 @@ -468,7 +468,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) br label %endif @@ -523,9 +523,9 @@ ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src1, %src1 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) %out = fadd float %temp.0, %temp.0 @@ -720,13 +720,13 @@ ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_endpgm main_body: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %src.0 = bitcast float %src to i32 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) %out = add i32 %src.1, %src.1 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) %out.1 = bitcast i32 %out.0 to float - call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) ret void } @@ -762,8 +762,8 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) ret float %out.0 @@ -801,8 +801,8 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src1.0 = bitcast float %src1 to i32 %out = add i32 %src0.0, %src1.0 @@ -868,7 +868,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) %out.1 = fadd float %src, %out.0 @@ -933,7 +933,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) br label %endif @@ -991,9 +991,9 @@ ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src1, %src1 %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) %out = fadd float %temp.0, %temp.0 @@ -1200,15 +1200,15 @@ ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_endpgm main_body: - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %src1.0 = bitcast float %src1 to i32 %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef) - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0) %out = add i32 %src0.1, %src1.1 %out.0 = bitcast i32 %out to float - call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) ret void } @@ -1285,7 +1285,7 @@ br label %END ELSE: - call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -1364,7 +1364,7 @@ br label %END ELSE: - call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -1437,16 +1437,16 @@ main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 %data.1 = extractelement <2 x float> %data, i32 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0) ; The load that determines the branch (and should therefore be WQM) is ; surrounded by stores that require disabled WQM. %idx.2 = extractelement <3 x i32> %idx, i32 1 - %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0) + %z = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 %data.3 = extractelement <2 x float> %data, i32 1 - call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -1540,7 +1540,7 @@ %tex0 = extractelement <4 x float> %tex, i32 0 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %dtex.1 = extractelement <4 x float> %dtex, i32 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %cc = fcmp ogt float %dtex.1, 0.0 br i1 %cc, label %IF, label %ELSE @@ -1611,8 +1611,8 @@ br i1 %cond, label %IF, label %END IF: - %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0) + %data = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0) br label %END END: @@ -1692,14 +1692,14 @@ %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %idx.0 = extractelement <2 x i32> %idx, i32 0 %data.0 = extractelement <2 x float> %data, i32 0 - call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0) %z.cmp = fcmp olt float %z, 0.0 call void @llvm.amdgcn.kill(i1 %z.cmp) %idx.1 = extractelement <2 x i32> %idx, i32 1 %data.1 = extractelement <2 x float> %data, i32 1 - call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0) %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex2.0 = extractelement <4 x float> %tex2, i32 0 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 @@ -1773,7 +1773,7 @@ %tex0 = extractelement <4 x float> %tex, i32 0 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0) %z.cmp = fcmp olt float %z, 0.0 call void @llvm.amdgcn.kill(i1 %z.cmp) @@ -2001,17 +2001,17 @@ entry: %array = alloca [32 x i32], align 4, addrspace(5) - call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0) store volatile i32 %a, ptr addrspace(5) %array, align 4 - call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0) %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx %c = load i32, ptr addrspace(5) %c.gep, align 4 %c.bc = bitcast i32 %c to float %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 - call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0) ret void } @@ -2188,7 +2188,7 @@ end: %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] - call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) ret <4 x float> %r } @@ -2306,8 +2306,8 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) ret float %out.0 @@ -2343,8 +2343,8 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src1.0 = bitcast float %src1 to i32 %out = add i32 %src0.0, %src1.0 @@ -2408,7 +2408,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) %out.1 = fadd float %src, %out.0 @@ -2471,7 +2471,7 @@ br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) br label %endif @@ -2526,9 +2526,9 @@ ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) - %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src1, %src1 %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) %out = fadd float %temp.0, %temp.0 @@ -2719,13 +2719,13 @@ ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_endpgm main_body: - %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) %src.0 = bitcast float %src to i32 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) %out = add i32 %src.1, %src.1 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) %out.1 = bitcast i32 %out.0 to float - call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) ret void } @@ -2884,7 +2884,7 @@ } ;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again. -define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) { +define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) { ; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: s_mov_b64 s[28:29], exec @@ -2988,21 +2988,23 @@ ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) - %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %reload, %reload %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) %temp3 = fadd float %temp2, %temp2 - %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0) + %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0) %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm) %temp5 = fadd float %temp3, %temp4 - %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) - %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + %res.int = ptrtoint ptr addrspace(8) %res to i128 + %res.vec = bitcast i128 %res.int to <4 x i32> + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) ret float %out } -define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) { +define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) { ; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec @@ -3098,22 +3100,24 @@ ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) - %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %reload, %reload %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) %temp3 = fadd float %temp2, %temp2 - %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) %temp5 = fadd float %temp3, %temp4 - %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) - %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %res.int = ptrtoint ptr addrspace(8) %res to i128 + %res.vec = bitcast i128 %res.int to <4 x i32> + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ret float %out } ;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again. -define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) { +define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) { ; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec @@ -3199,17 +3203,19 @@ ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) - %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %reload, %reload - %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) + %res.int = ptrtoint ptr addrspace(8) %res to i128 + %res.vec = bitcast i128 %res.int to <4 x i32> + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) %temp2 = fadd float %tex, %tex - %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) %temp4 = fadd float %temp2, %temp3 - %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) - call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) - %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ret float %out } @@ -3250,21 +3256,23 @@ main_body: %1 = ptrtoint ptr addrspace(6) %0 to i32 %2 = insertelement <4 x i32> , i32 %1, i32 0 - %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3 - %4 = fcmp nsz arcp ugt float %3, 0.000000e+00 - call void @llvm.amdgcn.kill(i1 %4) #1 + %3 = bitcast <4 x i32> %2 to i128 + %4 = inttoptr i128 %3 to ptr addrspace(8) + %5 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %4, i32 0, i32 0) #3 + %6 = fcmp nsz arcp ugt float %5, 0.000000e+00 + call void @llvm.amdgcn.kill(i1 %6) #1 ret void } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1 -declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2 -declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2 -declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2 -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2 -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3 -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3 +declare void @llvm.amdgcn.struct.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2 +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3 +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 @@ -3287,7 +3295,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32 immarg) #7 attributes #1 = { nounwind } attributes #2 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -2,7 +2,7 @@ ; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O0 %s ; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O3 %s -define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) { +define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-LABEL: strict_wwm_no_cfg: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -11,12 +11,20 @@ ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_mov_b32 s34, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s42, s41 +; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s35, s5 +; GFX9-O0-NEXT: s_mov_b32 s44, s35 +; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s38, s6 -; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s44 +; GFX9-O0-NEXT: s_mov_b32 s38, s43 +; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[5:6], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -103,7 +111,7 @@ ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] - %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> %tmp102 = extractelement <2 x i32> %tmp101, i32 0 %tmp103 = extractelement <2 x i32> %tmp101, i32 1 @@ -124,11 +132,11 @@ %tmp140 = shl nsw i32 %tmp139, 1 %tmp141 = and i32 %tmp140, 2 %tmp145 = bitcast i32 %tmp141 to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } -define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { +define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-LABEL: strict_wwm_cfg: ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -137,18 +145,26 @@ ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_mov_b32 s34, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s42, s41 +; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s35, s5 +; GFX9-O0-NEXT: s_mov_b32 s44, s35 +; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s38, s6 -; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] -; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] +; GFX9-O0-NEXT: s_mov_b32 s37, s44 +; GFX9-O0-NEXT: s_mov_b32 s38, s43 +; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 +; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 +; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill @@ -199,20 +215,28 @@ ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3 +; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 1 +; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 3 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX9-O0-NEXT: s_mov_b32 s34, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 -; GFX9-O0-NEXT: s_mov_b32 s34, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s34 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, v4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[36:37] +; GFX9-O0-NEXT: s_mov_b32 s36, 1 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s36, v0 +; GFX9-O0-NEXT: s_mov_b32 s36, 2 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s36 +; GFX9-O0-NEXT: s_mov_b32 s40, s35 +; GFX9-O0-NEXT: s_mov_b32 s36, s34 +; GFX9-O0-NEXT: s_mov_b32 s34, s39 +; GFX9-O0-NEXT: s_mov_b32 s35, s38 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s40 +; GFX9-O0-NEXT: s_mov_b32 s38, s35 +; GFX9-O0-NEXT: s_mov_b32 s39, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -275,7 +299,7 @@ ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] entry: - %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> %tmp102 = extractelement <2 x i32> %tmp101, i32 0 %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) @@ -302,7 +326,7 @@ %tmp140 = shl nsw i32 %tmp139, 1 %tmp141 = and i32 %tmp140, 2 %tmp145 = bitcast i32 %tmp141 to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -328,26 +352,34 @@ ret i32 %sub } -define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) { +define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-LABEL: strict_wwm_call: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s35, s33 +; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 -; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_mov_b32 s34, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s42, s41 +; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s35, s5 +; GFX9-O0-NEXT: s_mov_b32 s44, s35 +; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s38, s6 -; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s44 +; GFX9-O0-NEXT: s_mov_b32 s38, s43 +; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -370,13 +402,13 @@ ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; GFX9-O0-NEXT: s_mov_b32 s33, s35 +; GFX9-O0-NEXT: s_mov_b32 s33, s48 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -423,7 +455,7 @@ %tmp134 = call amdgpu_gfx i32 @strict_wwm_called(i32 %tmp107) %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -512,11 +544,11 @@ ret i64 %sub } -define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { +define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s42, s33 +; GFX9-O0-NEXT: s_mov_b32 s46, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill @@ -538,18 +570,27 @@ ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 -; GFX9-O0-NEXT: s_mov_b32 s36, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s5 ; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: v_writelane_b32 v10, s36, 2 -; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 -; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 -; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 +; GFX9-O0-NEXT: s_mov_b32 s35, s39 +; GFX9-O0-NEXT: s_mov_b32 s44, s38 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 +; GFX9-O0-NEXT: s_mov_b32 s37, s5 +; GFX9-O0-NEXT: s_mov_b32 s45, s37 +; GFX9-O0-NEXT: s_mov_b32 s40, s36 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s41, s45 +; GFX9-O0-NEXT: s_mov_b32 s42, s44 +; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: v_writelane_b32 v10, s40, 2 +; GFX9-O0-NEXT: v_writelane_b32 v10, s41, 3 +; GFX9-O0-NEXT: v_writelane_b32 v10, s42, 4 +; GFX9-O0-NEXT: v_writelane_b32 v10, s43, 5 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41 killed $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 @@ -621,7 +662,7 @@ ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 -; GFX9-O0-NEXT: s_mov_b32 s33, s42 +; GFX9-O0-NEXT: s_mov_b32 s33, s46 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -694,11 +735,11 @@ %tmp136 = add i64 %tmp134, %tmp107 %tmp137 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp136) %tmp138 = bitcast i64 %tmp137 to <2 x i32> - call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } -define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(ptr addrspace(8) inreg %desc, i32 %index) { ; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -707,12 +748,20 @@ ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_mov_b32 s34, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s42, s41 +; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s35, s5 +; GFX9-O0-NEXT: s_mov_b32 s44, s35 +; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s38, s6 -; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s44 +; GFX9-O0-NEXT: s_mov_b32 s38, s43 +; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: s_mov_b32 s34, 5 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 @@ -850,10 +899,10 @@ ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp17 = shl i32 %index, 5 - %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) + %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %desc, i32 %tmp17, i32 0) %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64> %tmp19 = or i32 %tmp17, 16 - %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0) + %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %desc, i32 %tmp19, i32 0) %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0 %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807) %tmp97 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp22) @@ -867,8 +916,8 @@ %.cast6 = bitcast i64 %tmp174 to <2 x float> %.cast7 = bitcast i64 %tmp251 to <2 x float> %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> - tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, <4 x i32> %desc, i32 %tmp17, i32 0, i32 0) - tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, <4 x i32> %desc, i32 %tmp19, i32 0, i32 0) + tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, ptr addrspace(8) %desc, i32 %tmp17, i32 0, i32 0) + tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, ptr addrspace(8) %desc, i32 %tmp19, i32 0, i32 0) ret void } @@ -1387,13 +1436,13 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) -declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) -declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8), i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32) attributes #0 = { "amdgpu-waves-per-eu"="5,5" } diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -4,8 +4,8 @@ ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. ; GFX9-LABEL: {{^}}no_cfg: -define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) { - %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) +define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> %tmp102 = extractelement <2 x i32> %tmp101, i32 0 %tmp103 = extractelement <2 x i32> %tmp101, i32 1 @@ -37,14 +37,14 @@ %tmp140 = shl nsw i32 %tmp139, 1 %tmp141 = and i32 %tmp140, 2 %tmp145 = bitcast i32 %tmp141 to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } ; GFX9-LABEL: {{^}}cfg: -define amdgpu_cs void @cfg(<4 x i32> inreg %tmp14, i32 %arg) { +define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { entry: - %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> %tmp102 = extractelement <2 x i32> %tmp101, i32 0 %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) @@ -85,7 +85,7 @@ %tmp140 = shl nsw i32 %tmp139, 1 %tmp141 = and i32 %tmp140, 2 %tmp145 = bitcast i32 %tmp141 to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -103,7 +103,7 @@ } ; GFX9-LABEL: {{^}}call: -define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) { +define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]] ; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}} ; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]] @@ -124,7 +124,7 @@ %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) ; GFX9: buffer_store_dword v0 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -137,8 +137,8 @@ } ; GFX9-LABEL: {{^}}call_i64: -define amdgpu_kernel void @call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { -; GFX9: s_load_dwordx2 s[[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]] +define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { +; GFX9: s_load_dwordx2 s[[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]], {{.*}}, 0x34 ; GFX9-O0: s_mov_b64 s[[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]], 0{{$}} ; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] @@ -160,19 +160,19 @@ %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136) %tmp138 = bitcast i64 %tmp137 to <2 x i32> ; GFX9: buffer_store_dwordx2 - call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } ; GFX9-LABEL: {{^}}_amdgpu_cs_main: -define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_cs void @_amdgpu_cs_main(ptr addrspace(8) inreg %desc, i32 %index) { %tmp17 = shl i32 %index, 5 ; GFX9: buffer_load_dwordx4 - %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) + %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %desc, i32 %tmp17, i32 0) %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64> %tmp19 = or i32 %tmp17, 16 ; GFX9: buffer_load_dwordx2 - %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0) + %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %desc, i32 %tmp19, i32 0) %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0 %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807) %tmp97 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp22) @@ -187,16 +187,16 @@ %.cast7 = bitcast i64 %tmp251 to <2 x float> %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> ; GFX9: buffer_store_dwordx4 - tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, <4 x i32> %desc, i32 %tmp17, i32 0, i32 0) + tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, ptr addrspace(8) %desc, i32 %tmp17, i32 0, i32 0) ; GFX9: buffer_store_dwordx2 - tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, <4 x i32> %desc, i32 %tmp19, i32 0, i32 0) + tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, ptr addrspace(8) %desc, i32 %tmp19, i32 0, i32 0) ret void } ; GFX9-LABEL: {{^}}strict_wwm_no_cfg: -define amdgpu_cs void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) { - %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) +define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> %tmp102 = extractelement <2 x i32> %tmp101, i32 0 %tmp103 = extractelement <2 x i32> %tmp101, i32 1 @@ -228,14 +228,14 @@ %tmp140 = shl nsw i32 %tmp139, 1 %tmp141 = and i32 %tmp140, 2 %tmp145 = bitcast i32 %tmp141 to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } ; GFX9-LABEL: {{^}}strict_wwm_cfg: -define amdgpu_cs void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { +define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { entry: - %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> %tmp102 = extractelement <2 x i32> %tmp101, i32 0 %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) @@ -276,7 +276,7 @@ %tmp140 = shl nsw i32 %tmp139, 1 %tmp141 = and i32 %tmp140, 2 %tmp145 = bitcast i32 %tmp141 to float - call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -294,7 +294,7 @@ } ; GFX9-LABEL: {{^}}strict_wwm_call: -define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) { +define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]] ; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}} ; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]] @@ -315,7 +315,7 @@ %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) ; GFX9: buffer_store_dword v0 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -328,8 +328,8 @@ } ; GFX9-LABEL: {{^}}strict_wwm_call_i64: -define amdgpu_kernel void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { -; GFX9: s_load_dwordx2 s[[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]] +define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { +; GFX9: s_load_dwordx2 s[[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]], {{.*}}, 0x34 ; GFX9-O0: s_mov_b64 s[[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]], 0{{$}} ; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] @@ -351,19 +351,19 @@ %tmp137 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp136) %tmp138 = bitcast i64 %tmp137 to <2 x i32> ; GFX9: buffer_store_dwordx2 - call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } ; GFX9-LABEL: {{^}}strict_wwm_amdgpu_cs_main: -define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { +define amdgpu_cs void @strict_wwm_amdgpu_cs_main(ptr addrspace(8) inreg %desc, i32 %index) { %tmp17 = shl i32 %index, 5 ; GFX9: buffer_load_dwordx4 - %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) + %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %desc, i32 %tmp17, i32 0) %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64> %tmp19 = or i32 %tmp17, 16 ; GFX9: buffer_load_dwordx2 - %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0) + %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8) %desc, i32 %tmp19, i32 0) %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0 %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807) %tmp97 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp22) @@ -378,9 +378,9 @@ %.cast7 = bitcast i64 %tmp251 to <2 x float> %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> ; GFX9: buffer_store_dwordx4 - tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, <4 x i32> %desc, i32 %tmp17, i32 0, i32 0) + tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, ptr addrspace(8) %desc, i32 %tmp17, i32 0, i32 0) ; GFX9: buffer_store_dwordx2 - tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, <4 x i32> %desc, i32 %tmp19, i32 0, i32 0) + tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, ptr addrspace(8) %desc, i32 %tmp19, i32 0, i32 0) ret void } @@ -391,11 +391,11 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) -declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) -declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) -declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(ptr addrspace(8), i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll b/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll @@ -4,14 +4,14 @@ ; Test that custom pseudo source values can be round trip serialized through MIR. ; CHECK-LABEL: {{^}}name: shader -; CHECK: %[[#]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed %17, %18, 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from unknown-address + 4, align 1, addrspace 8) +; CHECK: %[[#]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed %[[#]], %[[#]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.arg3 + 4, align 1, addrspace 8) ; CHECK: IMAGE_STORE_V4_V3_nsa_gfx10 killed %[[#]], %[[#]], %[[#]], %[[#]], killed %[[#]], 15, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK: DS_GWS_BARRIER %[[#]], 63, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") -define amdgpu_cs void @shader(i32 %arg0, i32 %arg1, <8 x i32> inreg %arg2, <4 x i32> inreg %arg3) { - %bload0 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %arg3, i32 4, i32 0, i32 0) - %bload1 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %arg3, i32 8, i32 0, i32 0) - %bload2 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %arg3, i32 12, i32 0, i32 0) - %bload3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %arg3, i32 16, i32 0, i32 0) +define amdgpu_cs void @shader(i32 %arg0, i32 %arg1, <8 x i32> inreg %arg2, ptr addrspace(8) inreg %arg3) { + %bload0 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %arg3, i32 4, i32 0, i32 0) + %bload1 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %arg3, i32 8, i32 0, i32 0) + %bload2 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %arg3, i32 12, i32 0, i32 0) + %bload3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %arg3, i32 16, i32 0, i32 0) %bload0.f = bitcast i32 %bload0 to float %bload1.f = bitcast i32 %bload1 to float %bload2.f = bitcast i32 %bload2 to float @@ -26,7 +26,7 @@ } declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #1 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) #1 declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2 attributes #0 = { nounwind willreturn writeonly } diff --git a/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll b/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll --- a/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll +++ b/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll @@ -1,36 +1,36 @@ ; RUN: opt < %s -S -mtriple=amdgcn-- -passes=early-cse -earlycse-debug-hash | FileCheck %s ; CHECK-LABEL: @no_cse -; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) -; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) -define void @no_cse(ptr addrspace(1) %out, <4 x i32> %in) { - %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) - %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) +; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 0, i32 0) +; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 4, i32 0) +define void @no_cse(ptr addrspace(1) %out, ptr addrspace(8) %in) { + %a = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 0, i32 0) + %b = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 4, i32 0) %c = add i32 %a, %b store i32 %c, ptr addrspace(1) %out ret void } ; CHECK-LABEL: @cse_zero_offset -; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) +; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 0, i32 0) ; CHECK: add i32 [[CSE]], [[CSE]] -define void @cse_zero_offset(ptr addrspace(1) %out, <4 x i32> %in) { - %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) - %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) +define void @cse_zero_offset(ptr addrspace(1) %out, ptr addrspace(8) %in) { + %a = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 0, i32 0) + %b = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 0, i32 0) %c = add i32 %a, %b store i32 %c, ptr addrspace(1) %out ret void } ; CHECK-LABEL: @cse_nonzero_offset -; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) +; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 4, i32 0) ; CHECK: add i32 [[CSE]], [[CSE]] -define void @cse_nonzero_offset(ptr addrspace(1) %out, <4 x i32> %in) { - %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) - %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) +define void @cse_nonzero_offset(ptr addrspace(1) %out, ptr addrspace(8) %in) { + %a = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 4, i32 0) + %b = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) %in, i32 4, i32 0) %c = add i32 %a, %b store i32 %c, ptr addrspace(1) %out ret void } -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> nocapture, i32, i32) +declare i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) nocapture, i32, i32) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll @@ -5,180 +5,180 @@ ; llvm.amdgcn.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret float %data } -define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <1 x float> @buffer_load_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <1 x float> %data } -define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <2 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <4 x float> @buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0 ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0 ; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1 ; CHECK-NEXT: ret { float, float } [[INS1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 %ins0 = insertvalue { float, float } undef, float %elt0, 0 @@ -186,9 +186,9 @@ ret { float, float } %ins1 } -define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0 ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1 ; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2 @@ -197,7 +197,7 @@ ; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2 ; CHECK-NEXT: ret { float, float, float } [[INS2]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 %elt2 = extractelement <4 x float> %data, i32 2 @@ -207,15 +207,15 @@ ret { float, float, float } %ins2 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> poison, float %elt0, i32 0 @@ -225,15 +225,15 @@ ret <2 x float> %ret } -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> poison, float %elt0, i32 0 @@ -243,15 +243,15 @@ ret <2 x float> %ret } -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> poison, float %elt2, i32 0 %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 @@ -260,114 +260,114 @@ ret <2 x float> %ret } -define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.buffer.load.format ; -------------------------------------------------------------------- -define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <1 x float> @buffer_load_format_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) + %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) ret <1 x float> %data } -define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } @@ -375,13 +375,13 @@ ; The initial insertion point is at the extractelement define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { ; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double> ; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0 ; CHECK-NEXT: ret double [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %arg, i32 16, i1 false, i1 false) #3 %var1 = bitcast <4 x float> %var to <2 x double> %var2 = extractelement <2 x double> %var1, i32 0 ret double %var2 @@ -389,11 +389,11 @@ define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { ; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %arg, i32 16, i1 false, i1 false) #3 %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 @@ -401,2499 +401,2499 @@ define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { ; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16 ; CHECK-NEXT: ret i16 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %arg, i32 16, i1 false, i1 false) #3 %var1 = bitcast <4 x float> %var to <8 x i16> %var2 = extractelement <8 x i16> %var1, i32 0 ret i16 %var2 } -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @raw_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define float @extract0_bitcast_raw_buffer_load_v4i32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float ; CHECK-NEXT: ret float [[VAR2]] ; - %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x i32> %var to <4 x float> %var2 = extractelement <4 x float> %var1, i32 0 ret float %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32>, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1 +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32) #1 -define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x half> %data, i32 1 ret half %elt0 } -define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x half> [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf } -declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1 +declare half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32) #1 -define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8( -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x i8> %data, i32 1 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x i8> [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf } -declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #1 -declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32>, i32, i32, i32) #1 -declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32>, i32, i32, i32) #1 -declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1 +declare i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8), i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.s.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @s_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @s_buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) ret float %data } -define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @s_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <4 x float> @s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } ; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened ; to vec4 anyway during lowering. -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define i32 @extract0_bitcast_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define float @extract0_bitcast_s_buffer_load_v4i32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float ; CHECK-NEXT: ret float [[VAR2]] ; - %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %var1 = bitcast <4 x i32> %var to <4 x float> %var2 = extractelement <4 x float> %var1, i32 0 ret float %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #1 +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8), i32, i32) #1 +declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8), i32, i32) #1 +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8), i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32) #1 -define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x half> [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf } -declare half @llvm.amdgcn.s.buffer.load.f16(<4 x i32>, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1 +declare half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8), i32, i32) #1 +declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8), i32, i32) #1 +declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(ptr addrspace(8), i32, i32) #1 +declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8), i32, i32) #1 -define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8( -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x i8> [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf } -declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32) #1 -declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32>, i32, i32) #1 -declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) #1 -declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1 +declare i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8), i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8), i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(ptr addrspace(8), i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8), i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.buffer.load.format ; -------------------------------------------------------------------- -define amdgpu_ps float @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @raw_buffer_load_format_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_raw_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define float @extract0_bitcast_raw_buffer_load_format_v4i32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[VAR]] ; - %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = extractelement <4 x float> %var, i32 0 ret float %var1 } -define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #1 +declare float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.struct.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @struct_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_struct_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define float @extract0_bitcast_struct_buffer_load_v4i32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float ; CHECK-NEXT: ret float [[VAR2]] ; - %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x i32> %var to <4 x float> %var2 = extractelement <4 x float> %var1, i32 0 ret float %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32) #1 -define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x half> [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf } -declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1 +declare half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32) #1 -define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8( -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x i8> [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf } -declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, i32) #1 +declare i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8), i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.struct.buffer.load.format ; -------------------------------------------------------------------- -define amdgpu_ps float @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @struct_buffer_load_format_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(ptr addrspace(8), i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.tbuffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @raw_tbuffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @raw_tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret float %data } -define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32) #1 -define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i64 3 ; CHECK-NEXT: ret half [[ELT1]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x half> [[DATA]], i64 2 ; CHECK-NEXT: ret half [[ELT1]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 2 ret half %elt1 } -define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[DATA]], i64 1 ; CHECK-NEXT: ret half [[ELT1]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 0 ret half %elt1 } -declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1 +declare half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.struct.tbuffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @struct_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @struct_tbuffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @struct_tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret float %data } -define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.tbuffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @tbuffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret float %data } -define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <2 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <4 x float> @tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define i32 @extract0_bitcast_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.image.sample diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -5,180 +5,180 @@ ; llvm.amdgcn.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret float %data } -define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <1 x float> @buffer_load_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <1 x float> %data } -define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <2 x float> %data } -define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <4 x float> @buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0 ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0 ; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1 ; CHECK-NEXT: ret { float, float } [[INS1]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 %ins0 = insertvalue { float, float } undef, float %elt0, 0 @@ -186,9 +186,9 @@ ret { float, float } %ins1 } -define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0 ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1 ; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2 @@ -197,7 +197,7 @@ ; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2 ; CHECK-NEXT: ret { float, float, float } [[INS2]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 %elt2 = extractelement <4 x float> %data, i32 2 @@ -207,15 +207,15 @@ ret { float, float, float } %ins2 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> undef, float %elt0, i32 0 @@ -225,15 +225,15 @@ ret <2 x float> %ret } -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> undef, float %elt0, i32 0 @@ -243,15 +243,15 @@ ret <2 x float> %ret } -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> undef, float %elt2, i32 0 %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 @@ -260,114 +260,114 @@ ret <2 x float> %ret } -define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.buffer.load.format ; -------------------------------------------------------------------- -define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <1 x float> @buffer_load_format_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) + %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) ret <1 x float> %data } -define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) + %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } @@ -375,13 +375,13 @@ ; The initial insertion point is at the extractelement define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { ; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8) undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double> ; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0 ; CHECK-NEXT: ret double [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %arg, i32 16, i1 false, i1 false) #3 %var1 = bitcast <4 x float> %var to <2 x double> %var2 = extractelement <2 x double> %var1, i32 0 ret double %var2 @@ -389,11 +389,11 @@ define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { ; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %arg, i32 16, i1 false, i1 false) #3 %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 @@ -401,2499 +401,2499 @@ define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { ; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8) undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16 ; CHECK-NEXT: ret i16 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %arg, i32 16, i1 false, i1 false) #3 %var1 = bitcast <4 x float> %var to <8 x i16> %var2 = extractelement <8 x i16> %var1, i32 0 ret i16 %var2 } -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.buffer.load.format.f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i1, i1) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @raw_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_raw_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define float @extract0_bitcast_raw_buffer_load_v4i32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float ; CHECK-NEXT: ret float [[VAR2]] ; - %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x i32> %var to <4 x float> %var2 = extractelement <4 x float> %var1, i32 0 ret float %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32>, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1 +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32) #1 -define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x half> %data, i32 1 ret half %elt0 } -define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x half> [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf } -declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1 +declare half @llvm.amdgcn.raw.buffer.load.f16(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32) #1 -define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8( -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x i8> %data, i32 1 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x i8> [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf } -declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #1 -declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32>, i32, i32, i32) #1 -declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32>, i32, i32, i32) #1 -declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1 +declare i8 @llvm.amdgcn.raw.buffer.load.i8(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(ptr addrspace(8), i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.s.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @s_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @s_buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) ret float %data } -define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @s_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <4 x float> @s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } ; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened ; to vec4 anyway during lowering. -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define i32 @extract0_bitcast_s_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define float @extract0_bitcast_s_buffer_load_v4i32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float ; CHECK-NEXT: ret float [[VAR2]] ; - %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %var1 = bitcast <4 x i32> %var to <4 x float> %var2 = extractelement <4 x float> %var1, i32 0 ret float %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(ptr addrspace(8), i32, i32) #1 +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(ptr addrspace(8), i32, i32) #1 +declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(ptr addrspace(8), i32, i32) #1 +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(ptr addrspace(8), i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(ptr addrspace(8), i32, i32) #1 -define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x half> [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf } -declare half @llvm.amdgcn.s.buffer.load.f16(<4 x i32>, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1 +declare half @llvm.amdgcn.s.buffer.load.f16(ptr addrspace(8), i32, i32) #1 +declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(ptr addrspace(8), i32, i32) #1 +declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(ptr addrspace(8), i32, i32) #1 +declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(ptr addrspace(8), i32, i32) #1 -define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8( -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[TMP1]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %ofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x i8> [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf } -declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32) #1 -declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32>, i32, i32) #1 -declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) #1 -declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1 +declare i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8), i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(ptr addrspace(8), i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(ptr addrspace(8), i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(ptr addrspace(8), i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.buffer.load.format ; -------------------------------------------------------------------- -define amdgpu_ps float @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @raw_buffer_load_format_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_raw_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define float @extract0_bitcast_raw_buffer_load_format_v4i32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[VAR]] ; - %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0) %var1 = extractelement <4 x float> %var, i32 0 ret float %var1 } -define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #1 +declare float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(ptr addrspace(8), i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.struct.buffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @struct_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_struct_buffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define float @extract0_bitcast_struct_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define float @extract0_bitcast_struct_buffer_load_v4i32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float ; CHECK-NEXT: ret float [[VAR2]] ; - %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x i32> %var to <4 x float> %var2 = extractelement <4 x float> %var1, i32 0 ret float %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32) #1 -define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x half> [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf } -declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1 +declare half @llvm.amdgcn.struct.buffer.load.f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32) #1 -define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8( -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 -; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret i8 [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x i8> [[DATA]] ; - %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf } -declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, i32) #1 +declare i8 @llvm.amdgcn.struct.buffer.load.i8(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(ptr addrspace(8), i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.struct.buffer.load.format ; -------------------------------------------------------------------- -define amdgpu_ps float @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @struct_buffer_load_format_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <1 x float> [[DATA]] ; - %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(ptr addrspace(8), i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.tbuffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @raw_tbuffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @raw_tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret float %data } -define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.raw.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32) #1 -define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i64 3 ; CHECK-NEXT: ret half [[ELT1]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x half> [[DATA]], i64 2 ; CHECK-NEXT: ret half [[ELT1]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 2 ret half %elt1 } -define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[DATA]], i64 1 ; CHECK-NEXT: ret half [[ELT1]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { ; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16( -; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret half [[DATA]] ; - %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 0 ret half %elt1 } -declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1 +declare half @llvm.amdgcn.raw.tbuffer.load.f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(ptr addrspace(8), i32, i32, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(ptr addrspace(8), i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.struct.tbuffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @struct_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @struct_tbuffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @struct_tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret float %data } -define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <2 x float> %data } -define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32) #1 -declare <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32) #1 -declare <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.struct.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 -declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.tbuffer.load ; -------------------------------------------------------------------- -define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @tbuffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret float %data } -define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <2 x float> %data } -define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <4 x float> @tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <4 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <4 x float> %data } -define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <3 x float> [[DATA]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> ; CHECK-NEXT: ret <3 x float> [[SHUF]] ; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 ; CHECK-NEXT: ret float [[ELT1]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: ret <2 x float> [[DATA]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[SHUF]] ; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define i32 @extract0_bitcast_tbuffer_load_v4f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 ; CHECK-NEXT: ret i32 [[VAR2]] ; - %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %var1 = bitcast <4 x float> %var to <4 x i32> %var2 = extractelement <4 x i32> %var1, i32 0 ret i32 %var2 } -define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(ptr addrspace(8) inreg %rsrc, i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 ; CHECK-NEXT: ret float [[DATA]] ; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 + %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.tbuffer.load.f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32, i32, i1, i1) #1 ; -------------------------------------------------------------------- ; llvm.amdgcn.image.sample diff --git a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -aa-pipeline=basic-aa,scoped-noalias-aa -passes=licm -S | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +define void @hoistable_noalias(ptr addrspace(8) noalias %p, ptr addrspace(8) noalias %q, i32 %bound) { +; CHECK-LABEL: define void @hoistable_noalias +; CHECK-SAME: (ptr addrspace(8) noalias [[P:%.*]], ptr addrspace(8) noalias [[Q:%.*]], i32 [[BOUND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0) +; CHECK-NEXT: [[INC:%.*]] = add i32 [[HOISTABLE]], [[ORIG]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0) +; CHECK-NEXT: [[NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[NEXT]], [[BOUND]] +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[TAIL:%.*]] +; CHECK: tail: +; CHECK-NEXT: ret void +; +entry: + br label %loop +loop: + %i = phi i32 [0, %entry], [%next, %loop] + + %hoistable = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) %q, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %p, i32 %i, i32 0, i32 0) + %inc = add i32 %hoistable, %orig + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %inc, ptr addrspace(8) %p, i32 %i, i32 0, i32 0) + + %next = add i32 %i, 1 + %cond = icmp ult i32 %next, %bound + br i1 %cond, label %loop, label %tail +tail: + ret void +} + +define void @not_hoistable_may_alias(ptr addrspace(8) %p, ptr addrspace(8) %q, i32 %bound) { +; CHECK-LABEL: define void @not_hoistable_may_alias +; CHECK-SAME: (ptr addrspace(8) [[P:%.*]], ptr addrspace(8) [[Q:%.*]], i32 [[BOUND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0) +; CHECK-NEXT: [[INC:%.*]] = add i32 [[HOISTABLE]], [[ORIG]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0) +; CHECK-NEXT: [[NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[NEXT]], [[BOUND]] +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[TAIL:%.*]] +; CHECK: tail: +; CHECK-NEXT: ret void +; +entry: + br label %loop +loop: + %i = phi i32 [0, %entry], [%next, %loop] + + %hoistable = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) %q, i32 0, i32 0, i32 0, i32 0) + %orig = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %p, i32 %i, i32 0, i32 0) + %inc = add i32 %hoistable, %orig + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %inc, ptr addrspace(8) %p, i32 %i, i32 0, i32 0) + + %next = add i32 %i, 1 + %cond = icmp ult i32 %next, %bound + br i1 %cond, label %loop, label %tail +tail: + ret void +} + +define void @hoistable_alias_scope(ptr addrspace(8) %p, ptr addrspace(8) %q, i32 %bound) { +; CHECK-LABEL: define void @hoistable_alias_scope +; CHECK-SAME: (ptr addrspace(8) [[P:%.*]], ptr addrspace(8) [[Q:%.*]], i32 [[BOUND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0), !alias.scope !0, !noalias !3 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[INC:%.*]] = add i32 [[HOISTABLE]], [[ORIG]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[NEXT]], [[BOUND]] +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[TAIL:%.*]] +; CHECK: tail: +; CHECK-NEXT: ret void +; +entry: + br label %loop +loop: + %i = phi i32 [0, %entry], [%next, %loop] + + %hoistable = call i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) %q, i32 0, i32 0, i32 0, i32 0), !alias.scope !4, !noalias !3 + %orig = call i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) %p, i32 %i, i32 0, i32 0), !alias.scope !3, !noalias !4 + %inc = add i32 %hoistable, %orig + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %inc, ptr addrspace(8) %p, i32 %i, i32 0, i32 0), !alias.scope !3, !noalias !4 + + %next = add i32 %i, 1 + %cond = icmp ult i32 %next, %bound + br i1 %cond, label %loop, label %tail +tail: + ret void +} +!0 = !{!0, !"hoisting"} +!1 = !{!1, !0, !"p"} +!2 = !{!2, !0, !"q"} + +!3 = !{!1} +!4 = !{!2} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare i32 @llvm.amdgcn.raw.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #0 +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare i32 @llvm.amdgcn.struct.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) #0 +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #1 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll --- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll +++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll @@ -420,7 +420,7 @@ ret void } -declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #0 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) nounwind declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) nounwind attributes #0 = { nounwind readonly } diff --git a/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop-inseltpoison.ll b/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop-inseltpoison.ll --- a/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop-inseltpoison.ll +++ b/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop-inseltpoison.ll @@ -9,7 +9,7 @@ define amdgpu_vs void @wrapper(i32 inreg %arg, i32 %arg1) { main_body: %tmp = add i32 %arg1, %arg - %tmp2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> undef, i32 %tmp, i32 0, i32 0, i32 0) + %tmp2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %tmp, i32 0, i32 0, i32 0) %tmp3 = extractelement <4 x float> %tmp2, i32 1 %tmp4 = fptosi float %tmp3 to i32 %tmp5 = insertelement <2 x i32> poison, i32 %tmp4, i32 1 @@ -45,7 +45,7 @@ ; Function Attrs: nounwind declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #2 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll b/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll --- a/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll +++ b/llvm/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll @@ -9,7 +9,7 @@ define amdgpu_vs void @wrapper(i32 inreg %arg, i32 %arg1) { main_body: %tmp = add i32 %arg1, %arg - %tmp2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> undef, i32 %tmp, i32 0, i32 0, i32 0) + %tmp2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8) undef, i32 %tmp, i32 0, i32 0, i32 0) %tmp3 = extractelement <4 x float> %tmp2, i32 1 %tmp4 = fptosi float %tmp3 to i32 %tmp5 = insertelement <2 x i32> undef, i32 %tmp4, i32 1 @@ -45,7 +45,7 @@ ; Function Attrs: nounwind declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #2 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 attributes #0 = { nounwind } diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll --- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll +++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll @@ -1,76 +1,76 @@ ; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) -define void @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i1 %bool) { +declare float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8), i32, i32, i1, i1) +define void @buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i1 %bool) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %bool - ; CHECK-NEXT: %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false) - %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false) + ; CHECK-NEXT: %data0 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false) + %data0 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %bool - ; CHECK-NEXT: %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool) - %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool) + ; CHECK-NEXT: %data1 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool) + %data1 = call float @llvm.amdgcn.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool) ret void } -declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) -define void @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs, i32 %arg) { +declare float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8), i32, i32, i32) +define void @raw_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs, i32 %arg) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg - ; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 %arg) - %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 %arg) + ; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 %arg) + %data = call float @llvm.amdgcn.raw.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 %arg) ret void } -declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) -define void @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs, i32 %arg) { +declare float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32) +define void @raw_buffer_load_format_f32(ptr addrspace(8) inreg %rsrc, i32 %ofs, i32 %sofs, i32 %arg) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg - ; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 %arg) - %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 %arg) + ; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 %arg) + %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 %sofs, i32 %arg) ret void } -declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) -define void @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) { +declare float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) +define void @struct_buffer_load_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg - ; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) - %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) + ; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) + %data = call float @llvm.amdgcn.struct.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) ret void } -declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) -define void @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) { +declare float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32) +define void @struct_buffer_load_format_f32(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg - ; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) - %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) + ; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) + %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 %arg) ret void } -declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) -define void @invalid_image_sample_1d_v4f32_f32(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc, i32 %dmask, i1 %bool, i32 %arg) { +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, ptr addrspace(8), i1, i32, i32) +define void @invalid_image_sample_1d_v4f32_f32(float %vaddr, <8 x i32> inreg %sampler, ptr addrspace(8) inreg %rsrc, i32 %dmask, i1 %bool, i32 %arg) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %dmask - ; CHECK-NEXT: %data0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 %dmask, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) - %data0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 %dmask, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + ; CHECK-NEXT: %data0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 %dmask, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 false, i32 0, i32 0) + %data0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 %dmask, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 false, i32 0, i32 0) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %bool - ; CHECK-NEXT: %data1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 %bool, i32 0, i32 0) - %data1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 %bool, i32 0, i32 0) + ; CHECK-NEXT: %data1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 %bool, i32 0, i32 0) + %data1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 %bool, i32 0, i32 0) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg - ; CHECK-NEXT: %data2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 %arg, i32 0) - %data2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 %arg, i32 0) + ; CHECK-NEXT: %data2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 false, i32 %arg, i32 0) + %data2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 false, i32 %arg, i32 0) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg - ; CHECK-NEXT: %data3 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 %arg) - %data3 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 %arg) + ; CHECK-NEXT: %data3 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 false, i32 0, i32 %arg) + %data3 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %vaddr, <8 x i32> %sampler, ptr addrspace(8) %rsrc, i1 false, i32 0, i32 %arg) ret void } @@ -469,12 +469,12 @@ ret void } -declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) -define amdgpu_ps void @raw_buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 %data, i32 %var) { +declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) +define amdgpu_ps void @raw_buffer_atomic_swap(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %var) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %var - ; CHECK-NEXT: %val2 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 %var) - %val2 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 %var) + ; CHECK-NEXT: %val2 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 %var) + %val2 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 %var) ret void } @@ -704,11 +704,11 @@ ret void } -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -define amdgpu_cs void @test_buffer_atomic_fadd(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) { +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i1) +define amdgpu_cs float @test_buffer_atomic_fadd(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %slc - ; CHECK-ENXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc) - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc) - ret void + ; CHECK-ENXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %offset, i1 %slc) + %val2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %offset, i1 %slc) + ret float %val2 }