diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1468,21 +1468,9 @@ std::pair AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { SDLoc DL(N); - const MachineFunction &MF = CurDAG->getMachineFunction(); - const SIMachineFunctionInfo *Info = MF.getInfo(); - - if (auto FI = dyn_cast(N)) { - SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), - FI->getValueType(0)); - - // If we can resolve this to a frame index access, this will be relative to - // either the stack or frame pointer SGPR. - return std::make_pair( - TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)); - } - // If we don't know this private access is a local stack object, it needs to - // be relative to the entry point's scratch wave offset. + // We rebase the base address into an absolute stack address and hence + // use constant 0 for soffset. return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32)); } diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -34,9 +34,9 @@ ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] +; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]] -; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]] define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float addrspace(5)* addrspace(3)* %ptr) #0 { %tmp0 = alloca float, addrspace(5) @@ -51,8 +51,8 @@ ; Same frame index is used multiple times in the store ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} +; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32 addrspace(5)*, addrspace(5) @@ -66,12 +66,12 @@ ; GCN-LABEL: {{^}}stored_fi_to_self_offset: ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}} +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}} ; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} -; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}} +; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} ; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}} -; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}} define amdgpu_kernel void @stored_fi_to_self_offset() #0 { %tmp0 = alloca [512 x i32], addrspace(5) @@ -91,12 +91,12 @@ ; GCN-LABEL: {{^}}stored_fi_to_fi: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} -; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} - ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} + +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} +; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} define amdgpu_kernel void @stored_fi_to_fi() #0 { %tmp0 = alloca i32 addrspace(5)*, addrspace(5) @@ -115,8 +115,8 @@ } ; GCN-LABEL: {{^}}stored_fi_to_global: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN: buffer_store_dword [[FI]] define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* %ptr) #0 { %tmp = alloca float, addrspace(5) @@ -128,13 +128,14 @@ ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} +; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} + ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} +; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} -; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5)* addrspace(1)* %ptr) #0 { %tmp0 = alloca float, addrspace(5) @@ -150,10 +151,10 @@ ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; FIXME: Re-initialize ; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}} +; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]] diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll --- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll @@ -2,8 +2,12 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,WAVE32 %s ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14: -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 -; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]] +; WAVE64: v_mov_b32_e32 [[FI:v[0-9]+]], 4 +; WAVE64: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]] + +; WAVE32: s_movk_i32 [[MASK_VAL:s[0-9]+]], 0x3ffc +; WAVE32: v_and_b32_e64 [[MASKED:v[0-9]+]], [[MASK_VAL]], 4 + ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 { %alloca = alloca i32, align 4, addrspace(5) @@ -15,8 +19,12 @@ } ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16: -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 -; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]] +; WAVE64: v_mov_b32_e32 [[FI:v[0-9]+]], 4 +; WAVE64: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]] + +; WAVE32: s_mov_b32 [[MASK_VAL:s[0-9]+]], 0xfffc +; WAVE32: v_and_b32_e64 [[MASKED:v[0-9]+]], [[MASK_VAL]], 4 + ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 { %alloca = alloca i32, align 4, addrspace(5) @@ -28,11 +36,12 @@ } ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17: -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 +; WAVE64: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; WAVE64-NOT: [[FI]] ; WAVE64: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] -; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]] +; WAVE32: s_mov_b32 [[MASK_VAL:s[0-9]+]], 0x1fffc +; WAVE32: v_and_b32_e64 [[MASKED:v[0-9]+]], [[MASK_VAL]], 4 ; WAVE32: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 { %alloca = alloca i32, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -22,33 +22,34 @@ ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v0, 64, v1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x2000 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0x2000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 +; GCN-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GCN-NEXT: BB0_1: ; %loadstoreloop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v3, s6, v1 +; GCN-NEXT: v_add_u32_e32 v2, s6, v1 ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_cmpk_lt_u32 s6, 0x2120 -; GCN-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen ; GCN-NEXT: s_cbranch_scc1 BB0_1 ; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 -; GCN-NEXT: v_add_u32_e32 v1, 0x20d0, v1 -; GCN-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3000 +; GCN-NEXT: v_add_u32_e32 v0, 0x20d0, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x3000 +; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen offset:64 +; GCN-NEXT: v_mov_b32_e32 v4, 0x3000 +; GCN-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:68 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm @@ -75,33 +76,34 @@ ; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xfff80000 ; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_add_u32_e32 v2, 64, v3 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 ; GCN-NEXT: s_add_u32 s32, s32, 0x180000 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; GCN-NEXT: BB1_1: ; %loadstoreloop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v5, s4, v3 +; GCN-NEXT: v_add_u32_e32 v4, s4, v3 ; GCN-NEXT: s_add_i32 s4, s4, 1 ; GCN-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; GCN-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_byte v2, v4, s[0:3], 0 offen ; GCN-NEXT: s_cbranch_scc1 BB1_1 ; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_add_u32_e32 v3, 0x20d0, v3 -; GCN-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_add_u32_e32 v2, 0x1000, v2 +; GCN-NEXT: v_add_u32_e32 v2, 0x20d0, v2 +; GCN-NEXT: v_mov_b32_e32 v6, 0x1000 +; GCN-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v2, v6, s[0:3], s33 offen offset:64 +; GCN-NEXT: v_mov_b32_e32 v6, 0x1000 +; GCN-NEXT: buffer_load_dword v5, v6, s[0:3], s33 offen offset:68 ; GCN-NEXT: s_sub_u32 s32, s32, 0x180000 ; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -10,6 +10,8 @@ ; GCN-LABEL: {{^}}legal_offset_fi: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:4{{$}} +; GCN: s_waitcnt +; GCN: v_mov_b32_e32 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -27,7 +27,6 @@ ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 ; GCN-NEXT: s_mov_b32 s32, 0xc0000 -; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 ; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 @@ -41,9 +40,10 @@ ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen -; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v2, 0x4000 +; GCN-NEXT: buffer_load_dword v0, v2, s[36:39], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v2, 0x4000 +; GCN-NEXT: buffer_load_dword v1, v2, s[36:39], 0 offen offset:8 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0