Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -1422,54 +1422,6 @@
 defm : MUBUFScratchLoadPat_Lo16;
 defm : MUBUFScratchLoadPat_Lo16;
 }
-
-// BUFFER_LOAD_DWORD*, addr64=0
-multiclass MUBUF_Load_Dword <ValueType vt,
-                             MUBUF_Pseudo offset,
-                             MUBUF_Pseudo offen,
-                             MUBUF_Pseudo idxen,
-                             MUBUF_Pseudo bothen> {
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
-                                  imm:$offset, 0, 0, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
-            (as_i1imm $slc), (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 1, 0, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
-           (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 0, 1, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
-           (as_i1imm $slc), (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 1, 1, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
-            (as_i1imm $tfe))
-  >;
-}
-
-defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
-                         BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
-defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
-                         BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
-defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
-                         BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
-
 multiclass MUBUFStore_Atomic_Pattern {
   // Store follows atomic op convention so address is forst
Index: lib/Target/AMDGPU/SIIntrinsics.td
===================================================================
--- lib/Target/AMDGPU/SIIntrinsics.td
+++ lib/Target/AMDGPU/SIIntrinsics.td
@@ -34,18 +34,4 @@
      llvm_i32_ty],   // tfe(imm)
     []>;
 
-  // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
-  def int_SI_buffer_load_dword : Intrinsic <
-    [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
-    [llvm_anyint_ty,  // rsrc(SGPR)
-     llvm_anyint_ty,  // vaddr(VGPR)
-     llvm_i32_ty,     // soffset(SGPR)
-     llvm_i32_ty,     // inst_offset(imm)
-     llvm_i32_ty,     // offen(imm)
-     llvm_i32_ty,     // idxen(imm)
-     llvm_i32_ty,     // glc(imm)
-     llvm_i32_ty,     // slc(imm)
-     llvm_i32_ty],    // tfe(imm)
-    [IntrReadMem, IntrArgMemOnly]>;
-
 } // End TargetPrefix = "SI", isTarget = 1
Index: test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
-
-; Example of a simple geometry shader loading vertex attributes from the
-; ESGS ring buffer
-
-; FIXME: Out of bounds immediate offset crashes
-
-; CHECK-LABEL: {{^}}main:
-; CHECK: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc
-; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
-
-define amdgpu_vs void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <32 x i8>] addrspace(4)* byval %arg2, [2 x <4 x i32>] addrspace(4)* byval %arg3, [17 x <4 x i32>] addrspace(4)* inreg %arg4, [17 x <4 x i32>] addrspace(4)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
-main_body:
-  %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(4)* %arg3, i64 0, i32 1
-  %tmp10 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
-  %tmp11 = shl i32 %arg6, 2
-  %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
-  %tmp13 = bitcast i32 %tmp12 to float
-  %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
-  %tmp15 = bitcast i32 %tmp14 to float
-  %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
-  %tmp17 = bitcast i32 %tmp16 to float
-  %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %tmp19 = bitcast i32 %tmp18 to float
-
-  %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %tmp21 = bitcast i32 %tmp20 to float
-
-  %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %tmp23 = bitcast i32 %tmp22 to float
-
-  call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false)
-  call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp21, float %tmp23, float %tmp23, float %tmp23, i1 true, i1 false)
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
-
-attributes #0 = { nounwind readonly }
-attributes #1 = { nounwind inaccessiblememonly }
-
-!0 = !{!"const", !1, i32 1}
-!1 = !{!"tbaa root"}
Index: test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf.ll
+++ test/CodeGen/AMDGPU/mubuf.ll
@@ -60,7 +60,7 @@
   %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
   %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
   %tmp2 = shl i32 %6, 2
-  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1)
   %tmp4 = add i32 %6, 16
   %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
   call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
@@ -79,7 +79,7 @@
   %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
   %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
   %tmp2 = shl i32 %6, 2
-  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1)
   %tmp4 = add i32 %6, 16
   %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
   call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
@@ -176,7 +176,7 @@
   ret void
 }
 
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
 
 attributes #0 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
===================================================================
--- test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
+++ test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
@@ -24,7 +24,7 @@
   %array_vector9 = insertelement <4 x float> , float %tmp1, i32 1
   %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2
   %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3
-  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0)
   call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1)
   %bc = bitcast <4 x float> %array_vector3 to <4 x i32>
   %tmp4 = extractelement <4 x i32> %bc, i32 undef
@@ -46,7 +46,7 @@
 }
 
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2
 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3
 
 attributes #0 = { nounwind "target-cpu"="tonga" }
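
Reviewer note (not part of the patch): the test updates above replace the removed 9-operand llvm.SI.buffer.load.dword calls with llvm.amdgcn.raw.buffer.load. As a minimal sketch of that mapping, assuming the usual raw/struct buffer intrinsic signatures (rsrc, [vindex,] voffset, soffset, cachepolicy) with glc and slc carried in cachepolicy bits 0 and 1 - the function name @migration_sketch, its arguments, and the struct-variant example for the old idxen case are illustrative only, since the patch's tests exercise only the raw form:

; Hypothetical module for illustration; names and arguments are assumptions, not taken from the patch.
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32) #0

; Legacy call being replaced (offen=1, idxen=0, glc=1, slc=0, tfe=0):
;   %v = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %rsrc, i32 %vaddr, i32 %soffset,
;                                                    i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
define void @migration_sketch(<4 x i32> %rsrc, i32 %vaddr, i32 %soffset, i32 addrspace(1)* %out) {
  ; Raw (offen) form: vaddr becomes voffset, the old immediate inst_offset is folded into
  ; voffset/soffset by the caller, and glc/slc move into cachepolicy bits 0/1 (glc = 1 here).
  %raw = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 %vaddr, i32 %soffset, i32 1)

  ; Struct (idxen) form: the operand after rsrc is vindex, followed by voffset, soffset, cachepolicy.
  %struct = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 %vaddr, i32 0, i32 %soffset, i32 1)

  %sum = add i32 %raw, %struct
  store i32 %sum, i32 addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind readonly }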