Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -63,10 +63,10 @@ bool runOnMachineFunction(MachineFunction &MF) override; void Select(SDNode *N) override; const char *getPassName() const override; - void PreprocessISelDAG() override; void PostprocessISelDAG() override; private: + SDValue foldFrameIndex(SDValue N) const; bool isInlineImmediate(const SDNode *N) const; bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, const R600InstrInfo *TII); @@ -910,6 +910,12 @@ return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); } +SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { + if (auto FI = dyn_cast<FrameIndexSDNode>(N)) + return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); + return N; +} + bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const { @@ -929,14 +935,14 @@ // Offsets in vaddr must be positive. ConstantSDNode *C1 = cast<ConstantSDNode>(N1); if (isLegalMUBUFImmOffset(C1)) { - VAddr = N0; + VAddr = foldFrameIndex(N0); ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } } // (node) - VAddr = Addr; + VAddr = foldFrameIndex(Addr); ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); return true; } @@ -1498,65 +1504,6 @@ return SelectVOP3Mods(In, Src, SrcMods); } -void AMDGPUDAGToDAGISel::PreprocessISelDAG() { - MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo(); - - // Handle the perverse case where a frame index is being stored. We don't - // want to see multiple frame index operands on the same instruction since - // it complicates things and violates some assumptions about frame index - // lowering. 
- for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); - I != E; ++I) { - SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32); - - // It's possible that we have a frame index defined in the function that - // isn't used in this block. - if (FI.use_empty()) - continue; - - // Skip over the AssertZext inserted during lowering. - SDValue EffectiveFI = FI; - auto It = FI->use_begin(); - if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) { - EffectiveFI = SDValue(*It, 0); - It = EffectiveFI->use_begin(); - } - - for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) { - SDUse &Use = It.getUse(); - SDNode *User = Use.getUser(); - unsigned OpIdx = It.getOperandNo(); - ++It; - - if (MemSDNode *M = dyn_cast<MemSDNode>(User)) { - unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1; - if (OpIdx == PtrIdx) - continue; - - unsigned OpN = M->getNumOperands(); - SDValue NewOps[8]; - - assert(OpN < array_lengthof(NewOps)); - for (unsigned Op = 0; Op != OpN; ++Op) { - if (Op != OpIdx) { - NewOps[Op] = M->getOperand(Op); - continue; - } - - MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SDLoc(M), MVT::i32, FI); - NewOps[Op] = SDValue(Mov, 0); - } - - CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN)); - } - - if (EffectiveFI->use_empty()) - CurDAG->RemoveDeadNode(EffectiveFI.getNode()); - } - } -} - void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -33,7 +33,6 @@ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) 
const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -89,7 +89,6 @@ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); @@ -1555,7 +1554,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); @@ -1602,43 +1600,6 @@ return nullptr; } -SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { - - SDLoc SL(Op); - FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); - unsigned FrameIndex = FINode->getIndex(); - - // A FrameIndex node represents a 32-bit offset into scratch memory. If the - // high bit of a frame index offset were to be set, this would mean that it - // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch - // buffer, with 64 being the number of threads per wave. - // - // The maximum private allocation for the entire GPU is 4G, and we are - // concerned with the largest the index could ever be for an individual - // workitem. This will occur with the minmum dispatch size. If a program - // requires more, the dispatch size will be reduced. 
- // - // With this limit, we can mark the high bit of the FrameIndex node as known - // zero, which is important, because it means in most situations we can prove - // that values derived from FrameIndex nodes are non-negative. This enables us - // to take advantage of more addressing modes when accessing scratch buffers, - // since for scratch reads/writes, the register offset must always be - // positive. - - uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; - - // XXX - It is unclear if partial dispatch works. Assume it works at half wave - // granularity. It is probably a full wave. - uint64_t MinGranularity = 32; - - unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity); - EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits); - - SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); - return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, - DAG.getValueType(ExtVT)); -} - bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) return false; Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -295,6 +295,11 @@ N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); }]>; +def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{ + auto FI = cast<FrameIndexSDNode>(N); + return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32); +}]>; + // Copied from the AArch64 backend: def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ return CurDAG->getTargetConstant( Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1940,6 +1940,11 @@ >; def : Pat < + (i32 frameindex:$fi), + (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi))) +>; + +def : Pat < (i64 InlineImm:$imm), (S_MOV_B64 InlineImm:$imm) >; Index: test/CodeGen/AMDGPU/captured-frame-index.ll 
=================================================================== --- test/CodeGen/AMDGPU/captured-frame-index.ll +++ test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,5 +1,17 @@ ; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; GCN-LABEL: {{^}}store_fi_lifetime: +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword [[FI]] +define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { +entry: + %b = alloca i8 + call void @llvm.lifetime.start(i64 1, i8* %b) + store volatile i8* %b, i8* addrspace(1)* undef + call void @llvm.lifetime.end(i64 1, i8* %b) + ret void +} + ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}} @@ -140,17 +152,18 @@ } ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: -; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc +; GCN: v_mov_b32_e32 [[VAL_0:v[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; GCN: buffer_store_dword [[VAL_0]], [[BASE_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen + +; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}} +; GCN: v_add_i32_e32 [[BASE_1_OFF_0:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]] -; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]] ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56 -; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 56, [[BASE_0_1]] +; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]] -; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN: buffer_store_dword [[BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 
0{{$}} define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { %tmp0 = alloca [4096 x i32] %tmp1 = alloca [4096 x i32] @@ -182,4 +195,8 @@ ret void } +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } Index: test/CodeGen/AMDGPU/local-stack-slot-bug.ll =================================================================== --- test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ test/CodeGen/AMDGPU/local-stack-slot-bug.ll @@ -7,8 +7,10 @@ ; ; CHECK-LABEL: {{^}}main: ; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 -; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]] -; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]] +; CHECK-DAG: v_mov_b32_e32 [[ZERO_BASE_FI:v[0-9]+]], 0{{$}} +; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]] +; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]] + ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen define amdgpu_ps float @main(i32 %idx) {