Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -191,6 +191,8 @@ unsigned NumElem, unsigned AS) const override; bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; + bool shouldConsiderGEPOffsetSplit() const override; + bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -762,6 +762,10 @@ return true; } +bool AMDGPUTargetLowering::shouldConsiderGEPOffsetSplit() const { + return true; +} + bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. Index: test/CodeGen/AMDGPU/large-offset-gep.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/large-offset-gep.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s + +%struct_type = type { [10000 x i32], i32, i32 } + +define amdgpu_kernel void @test1_global_sgpr_ptr(%struct_type addrspace(1)* addrspace(1)* %s, i32 %n) { +; CHECK-LABEL: test1_global_sgpr_ptr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_add_u32 s2, s2, 0x9c40 +; CHECK-NEXT: s_addc_u32 s3, s3, 0 +; CHECK-NEXT: s_cmp_ge_i32 s1, s0 +; CHECK-NEXT: s_cbranch_scc1 BB0_2 +; CHECK-NEXT: BB0_1: ; %while_body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_add_i32 s4, s1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; CHECK-NEXT: s_mov_b32 s1, s4 +; CHECK-NEXT: s_cmp_ge_i32 s1, s0 +; CHECK-NEXT: s_cbranch_scc0 BB0_1 +; CHECK-NEXT: BB0_2: ; %while_end +; CHECK-NEXT: s_endpgm +entry: + %struct = load %struct_type addrspace(1)*, %struct_type addrspace(1)* addrspace(1)* %s + br label %while_cond + +while_cond: + %phi = phi i32 [ 0, %entry ], [ %i, %while_body ] + %gep0 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 1 + %gep1 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 2 + %cmp = icmp slt i32 %phi, %n + br i1 %cmp, label %while_body, label %while_end + +while_body: + %i = add i32 %phi, 1 + store i32 %i, i32 addrspace(1)* %gep0 + store i32 %phi, i32 addrspace(1)* %gep1 + br label %while_cond + +while_end: + ret void +} + +define amdgpu_kernel void @test2_global_sgpr_ptr(%struct_type addrspace(1)* %struct, i32 %n) { +; CHECK-LABEL: test2_global_sgpr_ptr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 BB1_3 +; CHECK-NEXT: ; %bb.1: ; %while_cond.preheader +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s0, s0, 0x9c40 +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_ge_i32 s3, s2 +; CHECK-NEXT: s_cbranch_scc1 BB1_3 +; CHECK-NEXT: BB1_2: ; %while_body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_add_i32 s4, s3, 1 +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_cmp_ge_i32 s3, s2 +; CHECK-NEXT: s_cbranch_scc0 BB1_2 +; CHECK-NEXT: BB1_3: ; %while_end +; CHECK-NEXT: s_endpgm +entry: + %cmp = icmp eq %struct_type addrspace(1)* %struct, null + br i1 %cmp, label %while_end, label %while_cond + +while_cond: + %phi = phi i32 [ 0, %entry ], [ %i, %while_body ] + %gep0 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 1 + %gep1 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 2 + %cmp1 = icmp slt i32 %phi, %n + br i1 %cmp1, label %while_body, label %while_end + +while_body: + %i = add i32 %phi, 1 + store i32 %i, i32 addrspace(1)* %gep0 + store i32 %phi, i32 addrspace(1)* %gep1 + br label %while_cond + +while_end: + ret void +} + +define void @test1_global_vgpr_ptr(%struct_type addrspace(1)* addrspace(1)* %s, i32 %n) { +; CHECK-LABEL: test1_global_vgpr_ptr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 0x9c40, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: s_branch BB2_2 +; CHECK-NEXT: BB2_1: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_and_b64 s[10:11], exec, s[8:9] +; CHECK-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] +; CHECK-NEXT: s_mov_b64 s[4:5], s[10:11] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_cbranch_execz BB2_4 +; CHECK-NEXT: BB2_2: ; %while_cond +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s6, v2 +; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], exec +; CHECK-NEXT: s_and_saveexec_b64 s[10:11], vcc +; CHECK-NEXT: ; mask branch BB2_1 +; CHECK-NEXT: s_cbranch_execz BB2_1 +; CHECK-NEXT: BB2_3: ; %while_body +; CHECK-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; CHECK-NEXT: s_add_i32 s7, s6, 1 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; CHECK-NEXT: s_mov_b32 s6, s7 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: s_branch BB2_1 +; CHECK-NEXT: BB2_4: ; %while_end +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %struct = load %struct_type addrspace(1)*, %struct_type addrspace(1)* addrspace(1)* %s + br label %while_cond + +while_cond: + %phi = phi i32 [ 0, %entry ], [ %i, %while_body ] + %gep0 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 1 + %gep1 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 2 + %cmp = icmp slt i32 %phi, %n + br i1 %cmp, label %while_body, label %while_end + +while_body: + %i = add i32 %phi, 1 + store i32 %i, i32 addrspace(1)* %gep0 + store i32 %phi, i32 addrspace(1)* %gep1 + br label %while_cond + +while_end: + ret void +} + +define void @test2_global_vgpr_ptr(%struct_type addrspace(1)* %struct, i32 %n) { +; CHECK-LABEL: test2_global_vgpr_ptr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_mov_b64 s[6:7], 0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: ; mask branch BB3_6 +; CHECK-NEXT: s_cbranch_execz BB3_6 +; CHECK-NEXT: BB3_1: ; %while_cond.preheader +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 0x9c40, v0 +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: ; implicit-def: $sgpr10_sgpr11 +; CHECK-NEXT: s_branch BB3_3 +; CHECK-NEXT: BB3_2: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_and_b64 s[12:13], exec, s[10:11] +; CHECK-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], s[12:13] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_cbranch_execz BB3_5 +; CHECK-NEXT: BB3_3: ; %while_cond +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s8, v2 +; CHECK-NEXT: s_or_b64 s[10:11], s[10:11], exec +; CHECK-NEXT: s_and_saveexec_b64 s[12:13], vcc +; CHECK-NEXT: ; mask branch BB3_2 +; CHECK-NEXT: s_cbranch_execz BB3_2 +; CHECK-NEXT: BB3_4: ; %while_body +; CHECK-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; CHECK-NEXT: s_add_i32 s9, s8, 1 +; CHECK-NEXT: v_mov_b32_e32 v4, s8 +; CHECK-NEXT: v_mov_b32_e32 v3, s9 +; CHECK-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; CHECK-NEXT: s_mov_b32 s8, s9 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: s_branch BB3_2 +; CHECK-NEXT: BB3_5: ; %Flow1 +; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] +; CHECK-NEXT: BB3_6: ; %Flow2 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq %struct_type addrspace(1)* %struct, null + br i1 %cmp, label %while_end, label %while_cond + +while_cond: + %phi = phi i32 [ 0, %entry ], [ %i, %while_body ] + %gep0 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 1 + %gep1 = getelementptr %struct_type, %struct_type addrspace(1)* %struct, i64 0, i32 2 + %cmp1 = icmp slt i32 %phi, %n + br i1 %cmp1, label %while_body, label %while_end + +while_body: + %i = add i32 %phi, 1 + store i32 %i, i32 addrspace(1)* %gep0 + store i32 %phi, i32 addrspace(1)* %gep1 + br label %while_cond + +while_end: + ret void +} + Index: test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -4,14 +4,17 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s ; SI-MINREG: NumSgprs: {{[1-9]$}} -; SI-MINREG: NumVgprs: {{[1-9]$}} +; SI-MINREG: NumVgprs: {{1[0-9]$}} -; SI-MAXOCC: NumSgprs: {{[0-4][0-9]$}} +; SI-MAXOCC: NumSgprs: {{[0-9]$}} ; SI-MAXOCC: NumVgprs: {{[0-4][0-9]$}} ; stores may alias loads -; VI: NumSgprs: {{[0-9]$}} -; VI: NumVgprs: {{[1-3][0-9]$}} +; VI-MINREG: NumSgprs: {{[0-9]$}} +; VI-MINREG: NumVgprs: {{8[0-9]$}} + +; VI-MAXOCC: NumSgprs: {{6[0-9]$}} +; VI-MAXOCC: NumVgprs: {{3[0-9]$}} define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { bb: