Index: include/llvm/CodeGen/AsmPrinter.h
===================================================================
--- include/llvm/CodeGen/AsmPrinter.h
+++ include/llvm/CodeGen/AsmPrinter.h
@@ -620,6 +620,13 @@
   virtual void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
                                 const MCSubtargetInfo *EndInfo) const;
 
+  /// This emits visibility information about symbol, if this is supported by
+  /// the target.
+  void EmitVisibility(MCSymbol *Sym, unsigned Visibility,
+                      bool IsDefinition = true) const;
+
+  void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const;
+
 private:
   /// Private state for PrintSpecial()
   // Assign a unique ID to this machine instruction.
@@ -650,13 +657,6 @@
   // Internal Implementation Details
   //===------------------------------------------------------------------===//
 
-  /// This emits visibility information about symbol, if this is supported by
-  /// the target.
-  void EmitVisibility(MCSymbol *Sym, unsigned Visibility,
-                      bool IsDefinition = true) const;
-
-  void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const;
-
   void EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
                           const MachineBasicBlock *MBB, unsigned uid) const;
   void EmitLLVMUsedList(const ConstantArray *InitList);
Index: include/llvm/CodeGen/MachineOperand.h
===================================================================
--- include/llvm/CodeGen/MachineOperand.h
+++ include/llvm/CodeGen/MachineOperand.h
@@ -713,6 +713,10 @@
   /// ChangeToES - Replace this operand with a new external symbol operand.
   void ChangeToES(const char *SymName, unsigned char TargetFlags = 0);
 
+  /// ChangeToGA - Replace this operand with a new global address operand.
+  void ChangeToGA(const GlobalValue *GV, int64_t Offset,
+                  unsigned char TargetFlags = 0);
+
   /// ChangeToMCSymbol - Replace this operand with a new MC symbol operand.
   void ChangeToMCSymbol(MCSymbol *Sym);
 
Index: lib/CodeGen/MachineOperand.cpp
===================================================================
--- lib/CodeGen/MachineOperand.cpp
+++ lib/CodeGen/MachineOperand.cpp
@@ -181,6 +181,19 @@
   setTargetFlags(TargetFlags);
 }
 
+void MachineOperand::ChangeToGA(const GlobalValue *GV, int64_t Offset,
+                                unsigned char TargetFlags) {
+  assert((!isReg() || !isTied()) &&
+         "Cannot change a tied operand into a global address");
+
+  removeRegFromUses();
+
+  OpKind = MO_GlobalAddress;
+  Contents.OffsetedInfo.Val.GV = GV;
+  setOffset(Offset);
+  setTargetFlags(TargetFlags);
+}
+
 void MachineOperand::ChangeToMCSymbol(MCSymbol *Sym) {
   assert((!isReg() || !isTied()) &&
          "Cannot change a tied operand into an MCSymbol");
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -285,10 +285,39 @@
 }
 
 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  if (GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    const Triple::OSType OS = TM.getTargetTriple().getOS();
+
+    // LDS variables aren't emitted in HSA or PAL yet.
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return;
+
+    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
+      OutContext.reportError({},
+                             Twine(GV->getName()) +
+                                 ": unsupported initializer for address space");
+      return;
+    }
+
+    MCSymbol *GVSym = getSymbol(GV);
+
+    GVSym->redefineIfPossible();
+    if (GVSym->isDefined() || GVSym->isVariable())
+      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+                         "' is already defined");
 
-  // Group segment variables aren't emitted in HSA.
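+    // Emit the symbol together with its explicit size and alignment so that
+    // LDS layout can be performed at link time; a global without a stated
+    // alignment is conservatively given 4 bytes below.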
-  if (AMDGPU::isGroupSegment(GV))
+    const DataLayout &DL = GV->getParent()->getDataLayout();
+    uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
+    unsigned Align = GV->getAlignment();
+    if (!Align)
+      Align = 4;
+
+    EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+    EmitLinkage(GV, GVSym);
+    OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
+    getTargetStreamer()->emitAMDGPULDS(GVSym, Align);
     return;
+  }
 
   AsmPrinter::EmitGlobalVariable(GV);
 }
Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -50,7 +50,7 @@
     } else if (FoldOp->isFI()) {
       FrameIndexToFold = FoldOp->getIndex();
     } else {
-      assert(FoldOp->isReg());
+      assert(FoldOp->isReg() || FoldOp->isGlobal());
       OpToFold = FoldOp;
     }
   }
@@ -67,6 +67,8 @@
     return Kind == MachineOperand::MO_Register;
   }
 
+  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
   bool isCommuted() const {
     return Commuted;
   }
@@ -252,6 +254,12 @@
 
   assert(!Fold.needsShrink() && "not handled");
 
+  if (Fold.isGlobal()) {
+    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+                   Fold.OpToFold->getTargetFlags());
+    return true;
+  }
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -443,8 +451,7 @@
     return;
   }
 
-
-  bool FoldingImm = OpToFold.isImm();
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isGlobal();
 
   if (FoldingImm && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
@@ -805,7 +812,7 @@
   SmallVector<FoldCandidate, 4> FoldList;
   MachineOperand &Dst = MI.getOperand(0);
 
-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
   if (FoldingImm) {
     unsigned NumLiteralUses = 0;
     MachineOperand *NonInlineUse = nullptr;
@@ -1151,7 +1158,8 @@
   }
 
   MachineOperand &OpToFold = MI.getOperand(1);
-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImm =
+      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
 
   // FIXME: We could also be folding things like TargetIndexes.
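+  // Global address operands can be folded like immediates from here on: once
+  // lowered they are 32-bit literals carrying @abs32@ relocations, so they
+  // occupy the same operand slots as plain immediates.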
   if (!FoldingImm && !OpToFold.isReg())
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -349,6 +349,10 @@
 
   void finalizeLowering(MachineFunction &MF) const override;
 
+  void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+                                     const APInt &DemandedElts,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
   void computeKnownBitsForFrameIndex(const SDValue Op, KnownBits &Known,
                                      const APInt &DemandedElts,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3437,6 +3437,8 @@
   }
   case AMDGPU::GET_GROUPSTATICSIZE: {
+    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
     DebugLoc DL = MI.getDebugLoc();
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
         .add(MI.getOperand(0))
@@ -4548,7 +4550,9 @@
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GSD->getGlobal();
-  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+       (getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -4557,6 +4561,15 @@
   EVT PtrVT = Op.getValueType();
 
   // FIXME: Should not make address space based decisions here.
+
+  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+                                            SIInstrInfo::MO_ABS32_LO);
+    GA = DAG.getNode(ISD::AssertZext, DL, MVT::i32, GA,
+                     DAG.getValueType(MVT::i16));
+    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
+
   if (shouldEmitFixup(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
   else if (shouldEmitPCReloc(GV))
@@ -5479,6 +5492,18 @@
   case Intrinsic::amdgcn_fmad_ftz:
     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::amdgcn_groupstaticsize: {
+    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return Op;
+
+    const Module *M = MF.getFunction().getParent();
+    const GlobalValue *GV =
+        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+                                            SIInstrInfo::MO_ABS32_LO);
+    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -9874,6 +9899,27 @@
   TargetLoweringBase::finalizeLowering(MF);
 }
 
+void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                     KnownBits &Known,
+                                                     const APInt &DemandedElts,
+                                                     const SelectionDAG &DAG,
+                                                     unsigned Depth) const {
+  if (Op->isMachineOpcode()) {
+    switch (Op->getMachineOpcode()) {
+    case AMDGPU::S_MOV_B32:
+    case AMDGPU::V_MOV_B32_e32:
+      Known = DAG.computeKnownBits(Op->getOperand(0), DemandedElts, Depth + 1);
+      break;
+    default:
+      break;
+    }
+    return;
+  }
+
+  AMDGPUTargetLowering::computeKnownBitsForTargetNode(Op, Known, DemandedElts,
+                                                      DAG, Depth);
+}
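+// The S_MOV_B32 / V_MOV_B32_e32 cases above forward the moved operand's known
+// bits, so the AssertZext range attached to LDS @abs32@lo addresses (see
+// LowerGlobalAddress) stays visible through the wrapping mov and DS offset
+// folding can still fire.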
+
 void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2598,7 +2598,7 @@
                                         const MachineOperand &MO) const {
   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
 
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
 
   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
     return true;
@@ -3471,7 +3471,7 @@
     return isLegalRegOperand(MRI, OpInfo, MO);
 
   // Handle non-register types that are treated like immediates.
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
 
   return true;
 }
@@ -3512,7 +3512,7 @@
   }
 
   // Handle non-register types that are treated like immediates.
-  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
 
   if (!DefinedRC) {
     // This operand expects an immediate.
Index: test/CodeGen/AMDGPU/32-bit-local-address-space.ll
===================================================================
--- test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -81,8 +81,8 @@
 @g_lds = addrspace(3) global float undef, align 4
 
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
-; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
+; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
+; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
 define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float, float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
Index: test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
===================================================================
--- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -1,90 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}fold_mi_v_and_0:
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
-  %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %and = and i32 %size, %x
-  store i32 %and, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_and_0:
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %and = and i32 %size, %x
-  store i32 %and, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_v_or_0:
-; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
-  %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %or = or i32 %size, %x
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_or_0:
-; GCN: s_load_dword [[SVAL:s[0-9]+]]
-; GCN-NOT: [[SVAL]]
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
-; GCN-NOT: [[VVAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %or = or i32 %size, %x
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_v_xor_0:
-; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
-  %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %xor = xor i32 %size, %x
-  store i32 %xor, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_xor_0:
-; GCN: s_load_dword [[SVAL:s[0-9]+]]
-; GCN-NOT: [[SVAL]]
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
-; GCN-NOT: [[VVAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %xor = xor i32 %size, %x
-  store i32 %xor, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_not_0:
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %xor = xor i32 %size, -1
-  store i32 %xor, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}fold_mi_v_not_0:
 ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
 ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
@@ -138,7 +54,6 @@
 
 declare i64 @llvm.ctpop.i64(i64) #1
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-declare i32 @llvm.amdgcn.groupstaticsize() #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/ds-sub-offset.ll
===================================================================
--- test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -7,8 +7,8 @@
 
 ; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0
-; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
-; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]]
+; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, lds.obj@abs32@lo, [[SHL]]
+; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], lds.obj@abs32@lo, [[SHL]]
 ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
 ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
Index: test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2.ll
+++ test/CodeGen/AMDGPU/ds_read2.ll
@@ -355,7 +355,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]]
 ; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
@@ -441,8 +442,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -455,8 +456,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2
 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -471,9 +472,9 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@@ -488,10 +489,13 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
Index: test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_write2.ll
+++ test/CodeGen/AMDGPU/ds_write2.ll
@@ -103,10 +103,17 @@
 ; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 ; CI-DAG: s_mov_b32 m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-
-; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+;
+; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
+; early legalization of the constant bus constraint on the v_lshl_add_u32,
+; and then SIFoldOperands folds in an unlucky order.
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]
+
+; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
@@ -131,7 +138,12 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
@@ -153,7 +165,12 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
@@ -389,8 +406,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -402,8 +419,8 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
 define amdgpu_kernel void @store_constant_disjoint_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -416,9 +433,9 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
@@ -432,10 +449,13 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
+; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
Index: test/CodeGen/AMDGPU/lds-initializer.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-initializer.ll
+++ test/CodeGen/AMDGPU/lds-initializer.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
-; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
 
 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
Index: test/CodeGen/AMDGPU/lds-relocs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/lds-relocs.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t | FileCheck -check-prefixes=ELF %s
+
+@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
+@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
+
+; ELF: Relocations [
+; ELF-NEXT: Section (3) .rel.text {
+; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0
+; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0
+; ELF-NEXT: }
+; ELF-NEXT: ]
+
+; ELF: Symbol {
+; ELF: Name: lds.defined
+; ELF-NEXT: Value: 0x0
+; ELF-NEXT: Size: 32
+; ELF-NEXT: Binding: Global (0x1)
+; ELF-NEXT: Type: AMDGPU_LDS (0xD)
+; ELF-NEXT: Align: 8
+; ELF-NEXT: Other: 24
+; ELF-NEXT: Section: Undefined (0x0)
+; ELF-NEXT: }
+
+; ELF: Symbol {
+; ELF: Name: lds.external
+; ELF-NEXT: Value: 0x0
+; ELF-NEXT: Size: 0
+; ELF-NEXT: Binding: Global (0x1)
+; ELF-NEXT: Type: AMDGPU_LDS (0xD)
+; ELF-NEXT: Align: 4
+; ELF-NEXT: Other: 16
+; ELF-NEXT: Section: Undefined (0x0)
+; ELF-NEXT: }
+
+; GCN-LABEL: {{^}}test_basic:
+; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
+; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A]
+; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: .globl lds.external
+; GCN: .size lds.external, 0
+; GCN: .amdgpu_lds lds.external, 4
+; GCN: .globl lds.defined
+; GCN: .size lds.defined, 32
+; GCN: .amdgpu_lds lds.defined, 8
+define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
+main_body:
+  %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
+  %tmp = load i32, i32 addrspace(3)* %gep0
+
+  %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
+  %mask.32 = trunc i64 %mask to i32
+  %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
+  store i32 %mask.32, i32 addrspace(3)* %gep1
+
+  %r = bitcast i32 %tmp to float
+  ret float %r
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4
+
+attributes #0 = { "no-signed-zeros-fp-math"="true" }
+attributes #4 = { convergent nounwind readnone }
Index: test/CodeGen/AMDGPU/lds-size.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-size.ll
+++ test/CodeGen/AMDGPU/lds-size.ll
@@ -1,4 +1,3 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
 
Index: test/CodeGen/AMDGPU/lds-zero-initializer.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
-; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
 
 @lds = addrspace(3) global [256 x i32] zeroinitializer
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -268,7 +268,11 @@
 ; CIVI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
+
 ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -412,7 +416,11 @@
 ; CIVI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
+
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -131,7 +131,10 @@
 @lds0 = addrspace(3) global [512 x i32] undef, align 4
 
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
 ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -325,7 +328,10 @@
 @lds1 = addrspace(3) global [512 x i64] undef, align 8
 
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
 
 @lds0 = addrspace(3) global [512 x float] undef, align 4
 @lds1 = addrspace(3) global [256 x float] undef, align 4
@@ -8,7 +8,8 @@
 @large = addrspace(3) global [4096 x i32] undef, align 4
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test0:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
 define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
@@ -22,7 +23,8 @@
 }
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test1:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
 define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
 entry:
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -50,7 +52,8 @@
 
 ; Exceeds 16-bit simm limit of s_movk_i32
 ; CHECK-LABEL: {{^}}large_groupstaticsize:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
 define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
   %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
   store volatile i32 0, i32 addrspace(3)* %gep
Index: test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -17,8 +17,9 @@
 ; VI-LABEL: {{^}}dpp_test1:
 ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_nop 0
+; VI-OPT: s_nop 1
+; VI-NOOPT: s_nop 0
+; VI-NOOPT: s_nop 0
 ; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
 @0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
 define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
Index: test/CodeGen/AMDGPU/local-memory.amdgcn.ll
===================================================================
--- test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -3,12 +3,6 @@
 
 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-; Check that the LDS size emitted correctly
-; SI: .long 47180
-; SI-NEXT: .long 65668
-; CI: .long 47180
-; CI-NEXT: .long 32900
-
 ; GCN-LABEL: {{^}}local_memory:
 
 ; GCN-NOT: s_wqm_b64
@@ -36,27 +30,28 @@
 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
-; Check that the LDS size emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 8
-; GCN: .long 47180
-; GCN-NEXT: .long 32900
-
 ; GCN-LABEL: {{^}}local_memory_two_objects:
-; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
-; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, v0
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], local_memory_two_objects.local_mem0@abs32@lo
+; GCN-DAG: s_mov_b32 [[SBASE1:s[0-9]+]], local_memory_two_objects.local_mem1@abs32@lo
+; GCN-DAG: v_add_i32_e32 [[VPTR0:v[0-9]+]], vcc, [[SBASE0]], [[OFS]]
+; GCN-DAG: v_add_i32_e32 [[VPTR1:v[0-9]+]], vcc, [[SBASE1]], [[OFS]]
+; GCN-DAG: ds_write_b32 [[VPTR0]], {{v[0-9]+}}
+; GCN-DAG: ds_write_b32 [[VPTR1]], {{v[0-9]+}}
 
 ; GCN: s_barrier
 
-; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; GCN-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, [[SBASE0]], [[OFS]]
+; GCN-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, [[SBASE1]], [[OFS]]
+
+; SI-DAG: v_add_i32_e32 [[RPTR0:v[0-9]+]], vcc, 12, [[SUB0]]
+; SI-DAG: v_add_i32_e32 [[RPTR1:v[0-9]+]], vcc, 12, [[SUB1]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[RPTR0]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[RPTR1]]
 
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
+; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] offset:12
+; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] offset:12
 
-; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
-; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
 define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/local-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/local-memory.ll
+++ test/CodeGen/AMDGPU/local-memory.ll
@@ -10,8 +10,8 @@
 ; not an immediate.
 
 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
-; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
+; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
+; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4
 ; R600: LDS_READ_RET
 define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
Index: test/CodeGen/AMDGPU/merge-store-crash.ll
===================================================================
--- test/CodeGen/AMDGPU/merge-store-crash.ll
+++ test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -7,7 +7,8 @@
 @tess_lds = external addrspace(3) global [8192 x i32]
 
 ; CHECK-LABEL: {{^}}main:
-; CHECK: ds_write2_b32
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
 ; CHECK: v_mov_b32_e32 v1, v0
 ; CHECK: tbuffer_store_format_xyzw v[0:3],
 define amdgpu_vs void @main(i32 inreg %arg) {
Index: test/CodeGen/AMDGPU/over-max-lds-size.ll
===================================================================
--- test/CodeGen/AMDGPU/over-max-lds-size.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-
-; ERROR: error: local memory limit exceeded (400000) in use_huge_lds
-
-@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
-
-define amdgpu_kernel void @use_huge_lds() {
-entry:
-  %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
-  store i32 0, i32 addrspace(3)* %v0
-  ret void
-}
Index: test/CodeGen/AMDGPU/promote-alloca-globals.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -8,7 +8,10 @@
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
-; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
+; ASM: .size global_array0, 30000
+; ASM: .amdgpu_lds global_array0, 4
+; ASM: .size global_array1, 30000
+; ASM: .amdgpu_lds global_array1, 4
 
 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
Index: test/CodeGen/AMDGPU/s_addk_i32.ll
===================================================================
--- test/CodeGen/AMDGPU/s_addk_i32.ll
+++ test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -101,18 +101,5 @@
   ret void
 }
 
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; SI-LABEL: {{^}}commute_s_addk_i32:
-; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %add = add i32 %size, %b
-  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
-  ret void
-}
-
-declare i32 @llvm.amdgcn.groupstaticsize() #1
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/s_mulk_i32.ll
===================================================================
--- test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -40,18 +40,5 @@
   ret void
 }
 
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; SI-LABEL: {{^}}commute_s_mulk_i32:
-; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %add = mul i32 %size, %b
-  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
-  ret void
-}
-
-declare i32 @llvm.amdgcn.groupstaticsize() #1
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/shl_add_ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -33,7 +33,11 @@
 ; remaining add use goes through the normal shl + add constant fold.
 
 ; GCN-LABEL: {{^}}load_shl_base_lds_1:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+
+; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
+; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+
 ; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
 ; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
 ; GCN-DAG: buffer_store_dword [[RESULT]]
@@ -68,10 +72,18 @@
 
 ; The two globals are placed adjacent in memory, so the same base
 ; pointer can be used with an offset into the second one.
+; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints
+
 ; GCN-LABEL: {{^}}load_shl_base_lds_2:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
 ; GCN: s_mov_b32 m0, -1
-; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
+; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -4,16 +4,11 @@
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
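+; The external LDS object was replaced by a pointer derived from inttoptr
+; below: with LDS relocations, an external addrspace(3) global would emit an
+; @abs32@ relocation instead of folding to a known offset, which is beside
+; the point of this SGPR spilling test.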
 
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
 ; GCN-LABEL: {{^}}main:
 ; GCN: s_wqm
 
 ; Make sure not emitting unused scratch resource descriptor setup
 ; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
 
 ; GCN: s_mov_b32 m0
 
@@ -26,6 +21,7 @@
 
 ; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
+  %lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
   %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
   %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
@@ -203,18 +199,18 @@
   %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
   %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
-  %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
+  %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
   %tmp111 = bitcast float %p2.i to i32
   store i32 %tmp111, i32 addrspace(3)* %tmp110
   %tmp112 = bitcast float %p2.i96 to i32
   store i32 %tmp112, i32 addrspace(3)* %tmp110
   %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
-  %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113
+  %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
   %tmp115 = and i32 %tmp113, -4
-  %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
+  %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
   %tmp117 = add i32 %tmp115, 1
-  %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
+  %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
   %tmp119 = bitcast float %p2.i to i32
   store i32 %tmp119, i32 addrspace(3)* %tmp114
   %tmp120 = load i32, i32 addrspace(3)* %tmp116
@@ -241,7 +237,7 @@
   %tmp140 = fmul float %tmp59, %p2.i96
   %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
-  %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
+  %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
   %tmp143 = bitcast float %tmp137 to i32
   store i32 %tmp143, i32 addrspace(3)* %tmp142
   %tmp144 = bitcast float %tmp138 to i32
@@ -252,11 +248,11 @@
   store i32 %tmp146, i32 addrspace(3)* %tmp142
   %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
-  %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147
+  %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
   %tmp149 = and i32 %tmp147, -4
-  %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149
+  %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
   %tmp151 = add i32 %tmp149, 2
-  %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151
+  %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
   %tmp153 = bitcast float %tmp137 to i32
   store i32 %tmp153, i32 addrspace(3)* %tmp148
   %tmp154 = load i32, i32 addrspace(3)* %tmp150
Index: test/CodeGen/AMDGPU/sopk-compares.ll
===================================================================
--- test/CodeGen/AMDGPU/sopk-compares.ll
+++ test/CodeGen/AMDGPU/sopk-compares.ll
@@ -1,12 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-; Since this intrinsic is exposed as a constant after isel, use it to
-; defeat the DAG's compare with constant canonicalizations.
-declare i32 @llvm.amdgcn.groupstaticsize() #1
-
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
 ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}
 define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
@@ -232,23 +226,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}br_scc_sge_i32:
-; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sge i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}br_scc_slt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}
 define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
@@ -265,57 +242,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}br_scc_sle_i32:
-; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sle i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}br_scc_ugt_i32:
-; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ugt i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}br_scc_uge_i32:
-; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp uge i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}br_scc_ult_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}
 define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
@@ -364,211 +290,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}br_scc_ule_i32:
-; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ule i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_eq_i32:
-; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp eq i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ne_i32:
-; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ne i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_sgt_i32:
-; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sgt i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_sge_i32:
-; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sge i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_slt_i32:
-; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp slt i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_sle_i32:
-; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sle i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ugt_i32:
-; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ugt i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_uge_i32:
-; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp uge i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ult_i32:
-; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ult i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ule_i32:
-; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ule i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16:
-; GCN: s_cmp_lt_u32 s2, 0xfffff7ff
-define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %not.size = xor i32 %size, -1
-  %cmp0 = icmp ult i32 %cond, %not.size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}br_scc_eq_i64_inline_imm:
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
Index: test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- test/CodeGen/AMDGPU/sub.i16.ll
+++ test/CodeGen/AMDGPU/sub.i16.ll
@@ -144,26 +144,7 @@
   ret void
 }
 
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
-; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
-; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %size.trunc = trunc i32 %size to i16
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %add = sub i16 %a, %size.trunc
-  store i16 %add, i16 addrspace(1)* %out
-  ret void
-}
-
 declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare i32 @llvm.amdgcn.groupstaticsize() #0
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/target-cpu.ll
===================================================================
--- test/CodeGen/AMDGPU/target-cpu.ll
+++ test/CodeGen/AMDGPU/target-cpu.ll
@@ -78,7 +78,6 @@
 
 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
 ; CHECK: ds_read_b32
-; CHECK: ; LDSByteSize: 5120
 define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
Index: test/CodeGen/MIR/AMDGPU/machine-function-info.ll
===================================================================
--- test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -10,7 +10,7 @@
 ; CHECK: machineFunctionInfo:
 ; CHECK-NEXT: explicitKernArgSize: 128
 ; CHECK-NEXT: maxKernArgAlign: 64
-; CHECK-NEXT: ldsSize: 2048
+; CHECK-NEXT: ldsSize: 0
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false