Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -63,7 +63,6 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
   void Select(SDNode *N) override;
   const char *getPassName() const override;
-  void PreprocessISelDAG() override;
   void PostprocessISelDAG() override;
 private:
@@ -1498,65 +1497,6 @@
   return SelectVOP3Mods(In, Src, SrcMods);
 }
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
-
-  // Handle the perverse case where a frame index is being stored. We don't
-  // want to see multiple frame index operands on the same instruction since
-  // it complicates things and violates some assumptions about frame index
-  // lowering.
-  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
-       I != E; ++I) {
-    SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
-
-    // It's possible that we have a frame index defined in the function that
-    // isn't used in this block.
-    if (FI.use_empty())
-      continue;
-
-    // Skip over the AssertZext inserted during lowering.
-    SDValue EffectiveFI = FI;
-    auto It = FI->use_begin();
-    if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
-      EffectiveFI = SDValue(*It, 0);
-      It = EffectiveFI->use_begin();
-    }
-
-    for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
-      SDUse &Use = It.getUse();
-      SDNode *User = Use.getUser();
-      unsigned OpIdx = It.getOperandNo();
-      ++It;
-
-      if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
-        unsigned PtrIdx = M->getOpcode() == ISD::STORE ?
2 : 1;
-        if (OpIdx == PtrIdx)
-          continue;
-
-        unsigned OpN = M->getNumOperands();
-        SDValue NewOps[8];
-
-        assert(OpN < array_lengthof(NewOps));
-        for (unsigned Op = 0; Op != OpN; ++Op) {
-          if (Op != OpIdx) {
-            NewOps[Op] = M->getOperand(Op);
-            continue;
-          }
-
-          MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                                      SDLoc(M), MVT::i32, FI);
-          NewOps[Op] = SDValue(Mov, 0);
-        }
-
-        CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
-      }
-
-      if (EffectiveFI->use_empty())
-        CurDAG->RemoveDeadNode(EffectiveFI.getNode());
-    }
-  }
-}
-
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -33,7 +33,6 @@
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -89,7 +89,6 @@
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
   setOperationAction(ISD::SELECT, MVT::i1, Promote);
@@ -1555,7 +1554,6 @@
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return
AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
     SDValue Result = LowerLOAD(Op, DAG);
@@ -1602,43 +1600,6 @@
   return nullptr;
 }
-SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
-  SDLoc SL(Op);
-  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
-  unsigned FrameIndex = FINode->getIndex();
-
-  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
-  // high bit of a frame index offset were to be set, this would mean that it
-  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
-  // buffer, with 64 being the number of threads per wave.
-  //
-  // The maximum private allocation for the entire GPU is 4G, and we are
-  // concerned with the largest the index could ever be for an individual
-  // workitem. This will occur with the minmum dispatch size. If a program
-  // requires more, the dispatch size will be reduced.
-  //
-  // With this limit, we can mark the high bit of the FrameIndex node as known
-  // zero, which is important, because it means in most situations we can prove
-  // that values derived from FrameIndex nodes are non-negative. This enables us
-  // to take advantage of more addressing modes when accessing scratch buffers,
-  // since for scratch reads/writes, the register offset must always be
-  // positive.
-
-  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
-
-  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
-  // granularity. It is probably a full wave.
-  uint64_t MinGranularity = 32;
-
-  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
-  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
-
-  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
-  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
-                     DAG.getValueType(ExtVT));
-}
-
 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
   if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
     return false;
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -295,6 +295,11 @@
   N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
 }]>;
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+  auto FI = cast<FrameIndexSDNode>(N);
+  return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
 // Copied from the AArch64 backend:
 def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
 return CurDAG->getTargetConstant(
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1940,6 +1940,11 @@
 >;
 def : Pat <
+  (i32 frameindex:$fi),
+  (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+>;
+
+def : Pat <
   (i64 InlineImm:$imm),
   (S_MOV_B64 InlineImm:$imm)
 >;
Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -271,7 +271,12 @@
 ; R600_CHECK: MOV
 ; R600_CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
-; SI: v_mov_b32_e32 v3
+
+; SI: v_mov_b32_e32 [[FI0:v[0-9]+]], 0 ; encoding
+
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, [[FI0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1 ; encoding
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, [[FI0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen ; encoding
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, [[FI0]],
s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2 ; encoding define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1 Index: test/CodeGen/AMDGPU/captured-frame-index.ll =================================================================== --- test/CodeGen/AMDGPU/captured-frame-index.ll +++ test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,12 +1,23 @@ ; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; GCN-LABEL: {{^}}store_fi_lifetime: +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword [[FI]] +define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { +entry: + %b = alloca i8 + call void @llvm.lifetime.start(i64 1, i8* %b) + store volatile i8* %b, i8* addrspace(1)* undef + call void @llvm.lifetime.end(i64 1, i8* %b) + ret void +} + ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] -; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]] -; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]] ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] -; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] +; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]] define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { %tmp = alloca float store float 4.0, float *%tmp @@ -17,15 +28,14 @@ ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects: ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[FI1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN-DAG: s_load_dword 
[[LDSPTR:s[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]
-
-; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
 ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
 define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
@@ -59,10 +69,9 @@
 ; GCN: buffer_store_dword [[K0]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
-
-; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
+; GCN-DAG: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
+; GCN: buffer_store_dword [[K1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 define void @stored_fi_to_self_offset() #0 {
   %tmp0 = alloca [512 x i32]
   %tmp1 = alloca i32*
@@ -79,16 +88,17 @@
 }
 ; GCN-LABEL: {{^}}stored_fi_to_fi:
-; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[FI1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[FI1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]],
s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}} -; GCN: buffer_store_dword [[FI2]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}} + +; GCN: buffer_store_dword [[FI1]], [[FI2]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword [[FI2]], [[FI1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} define void @stored_fi_to_fi() #0 { %tmp0 = alloca i32* %tmp1 = alloca i32* @@ -106,9 +116,10 @@ } ; GCN-LABEL: {{^}}stored_fi_to_global: -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[FI]] +; GCN: v_mov_b32_e32 [[FI0:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword [[FI1]], [[FI0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; GCN: buffer_store_dword [[FI0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { %tmp = alloca float store float 0.0, float *%tmp @@ -118,14 +129,19 @@ ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects: -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} -; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN: v_mov_b32_e32 [[FI0:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} + +; GCN: buffer_store_dword [[ZERO]], [[FI0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}} +; GCN: buffer_store_dword [[ZERO]], [[FI1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; GCN: buffer_store_dword [[ZERO]], [[FI2]], s{{\[[0-9]+:[0-9]+\]}}, 
s{{[0-9]+}} offen + + +; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float @@ -140,17 +156,15 @@ } ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: -; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc -; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen - -; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]] -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56 -; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN: v_mov_b32_e32 [[BASE_0_0:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword [[ZERO]], [[BASE_0_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]] -; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_0:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_0]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 56, [[BASE_0_0]] +; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword [[BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { %tmp0 = alloca [4096 x i32] %tmp1 = alloca [4096 x i32] @@ -182,4 +196,8 @@ ret void } +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } Index: test/CodeGen/AMDGPU/local-stack-slot-bug.ll 
=================================================================== --- test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ test/CodeGen/AMDGPU/local-stack-slot-bug.ll @@ -7,9 +7,13 @@ ; ; CHECK-LABEL: {{^}}main: ; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 -; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]] -; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]] +; CHECK-DAG: v_mov_b32_e32 [[ZERO_BASE_FI:v[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 [[HI_BASE_FI:v[0-9]+]], 0x200{{$}} +; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, [[HI_BASE_FI]], [[BYTES]] +; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, [[ZERO_BASE_FI]], [[BYTES]] + ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen +; CHECK: buffer_store_dword v{{[0-9]+}}, [[HI_BASE_FI]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:{{[0-9]+}} ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen define amdgpu_ps float @main(i32 %idx) { main_body: