diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -315,14 +315,12 @@
     const DataLayout &DL = GV->getParent()->getDataLayout();
     uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
-    unsigned Align = GV->getAlignment();
-    if (!Align)
-      Align = 4;
+    Align Alignment = GV->getAlign().getValueOr(Align(4));
 
     emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
     emitLinkage(GV, GVSym);
     if (auto TS = getTargetStreamer())
-      TS->emitAMDGPULDS(GVSym, Size, Align);
+      TS->emitAMDGPULDS(GVSym, Size, Alignment);
     return;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -317,10 +317,9 @@
   Type *Ty = I.getType();
   const DataLayout &DL = Mod->getDataLayout();
   int TySize = DL.getTypeSizeInBits(Ty);
-  unsigned Align = I.getAlignment() ?
-    I.getAlignment() : DL.getABITypeAlignment(Ty);
+  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
 
-  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
 }
 
 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1010,7 +1010,7 @@
   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
   CallingConv::ID CC = Fn.getCallingConv();
 
-  unsigned MaxAlign = 1;
+  Align MaxAlign = Align(1);
   uint64_t ExplicitArgOffset = 0;
   const DataLayout &DL = Fn.getParent()->getDataLayout();
@@ -1018,12 +1018,12 @@
   for (const Argument &Arg : Fn.args()) {
     Type *BaseArgTy = Arg.getType();
-    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
-    MaxAlign = std::max(Align, MaxAlign);
+    Align Alignment = DL.getABITypeAlign(BaseArgTy);
+    MaxAlign = std::max(Alignment, MaxAlign);
     unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
 
-    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
-    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
 
     // We're basically throwing away everything passed into us and starting over
     // to get accurate in-memory offsets. The "PartOffset" is completely useless
@@ -2931,16 +2931,17 @@
   EVT VT = LN->getMemoryVT();
   unsigned Size = VT.getStoreSize();
-  unsigned Align = LN->getAlignment();
-  if (Align < Size && isTypeLegal(VT)) {
+  Align Alignment = LN->getAlign();
+  if (Alignment < Size && isTypeLegal(VT)) {
     bool IsFast;
     unsigned AS = LN->getAddressSpace();
 
    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
-    if (!allowsMisalignedMemoryAccesses(
-            VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+                                        LN->getMemOperand()->getFlags(),
+                                        &IsFast)) {
       SDValue Ops[2];
 
       if (VT.isVector())
@@ -2985,8 +2986,8 @@
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
-  unsigned Align = SN->getAlignment();
-  if (Align < Size && isTypeLegal(VT)) {
+  Align Alignment = SN->getAlign();
+  if (Alignment < Size && isTypeLegal(VT)) {
     bool IsFast;
     unsigned AS = SN->getAddressSpace();
 
@@ -2994,8 +2995,9 @@
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
-    if (!allowsMisalignedMemoryAccesses(
-            VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+                                        SN->getMemOperand()->getFlags(),
+                                        &IsFast)) {
       if (VT.isVector())
         return scalarizeVectorStore(SN, DAG);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -43,14 +43,13 @@
   if (!Entry.second)
     return Entry.first->second;
 
-  unsigned Align = GV.getAlignment();
-  if (Align == 0)
-    Align = DL.getABITypeAlignment(GV.getValueType());
+  Align Alignment =
+      DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
   /// TODO: We should sort these to minimize wasted space due to alignment
   /// padding. Currently the padding is decided by the first encountered use
   /// during lowering.
-  unsigned Offset = LDSSize = alignTo(LDSSize, Align);
+  unsigned Offset = LDSSize = alignTo(LDSSize, Alignment);
 
   Entry.first->second = Offset;
   LDSSize += DL.getTypeAllocSize(GV.getValueType());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -736,16 +736,15 @@
         continue;
 
       if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
+        Align Alignment =
+            DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
        // FIXME: Try to account for padding here. The padding is currently
        // determined from the inverse order of uses in the function. I'm not
        // sure if the use list order is in any way connected to this, so the
        // total reported size is likely incorrect.
        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
        CurrentLocalMemUsage += AllocSize;
        break;
      }
@@ -837,9 +836,8 @@
   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
 
-  unsigned Align = I.getAlignment();
-  if (Align == 0)
-    Align = DL.getABITypeAlignment(I.getAllocatedType());
+  Align Alignment =
+      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
 
   // FIXME: This computed padding is likely wrong since it depends on inverse
   // usage order.
 
   // FIXME: It is also possible that if we're allowed to use all of the memory
   // could could end up using more than the maximum due to alignment padding.
-  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
   NewSize += AllocSize;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1303,18 +1303,16 @@
 // the three offsets (voffset, soffset and instoffset)
 static unsigned setBufferOffsets(MachineIRBuilder &B,
                                  const AMDGPURegisterBankInfo &RBI,
-                                 Register CombinedOffset,
-                                 Register &VOffsetReg,
-                                 Register &SOffsetReg,
-                                 int64_t &InstOffsetVal,
-                                 unsigned Align) {
+                                 Register CombinedOffset, Register &VOffsetReg,
+                                 Register &SOffsetReg, int64_t &InstOffsetVal,
+                                 Align Alignment) {
   const LLT S32 = LLT::scalar(32);
   MachineRegisterInfo *MRI = B.getMRI();
 
   if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset,
-                                 &RBI.Subtarget, Align)) {
+    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
+                                 Alignment)) {
       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
       InstOffsetVal = ImmOffset;
@@ -1334,7 +1332,7 @@
     uint32_t SOffset, ImmOffset;
     if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
-                                               &RBI.Subtarget, Align)) {
+                                               &RBI.Subtarget, Alignment)) {
       if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
         VOffsetReg = Base;
         SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
@@ -1417,7 +1415,7 @@
   // Use the alignment to ensure that the required offsets will fit into the
   // immediate offsets.
-  const unsigned Alignment = NumLoads > 1 ? 16 * NumLoads : 1;
+  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
 
   MachineIRBuilder B(MI);
   MachineFunction &MF = B.getMF();
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4460,19 +4460,19 @@
   if (Size > LocalMemorySize)
     return Error(SizeLoc, "size is too large");
 
-  int64_t Align = 4;
+  int64_t Alignment = 4;
   if (getLexer().is(AsmToken::Comma)) {
     Lex();
     SMLoc AlignLoc = getLexer().getLoc();
-    if (getParser().parseAbsoluteExpression(Align))
+    if (getParser().parseAbsoluteExpression(Alignment))
       return true;
-    if (Align < 0 || !isPowerOf2_64(Align))
+    if (Alignment < 0 || !isPowerOf2_64(Alignment))
       return Error(AlignLoc, "alignment must be a power of two");
 
    // Alignment larger than the size of LDS is possible in theory, as long
    // as the linker manages to place to symbol at address 0, but we do want
    // to make sure the alignment fits nicely into a 32-bit integer.
-    if (Align >= 1u << 31)
+    if (Alignment >= 1u << 31)
       return Error(AlignLoc, "alignment is too large");
   }
 
@@ -4484,7 +4484,7 @@
   if (!Symbol->isUndefined())
     return Error(NameLoc, "invalid symbol redefinition");
 
-  getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align);
+  getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align(Alignment));
   return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -54,7 +54,7 @@
   virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
 
   virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
-                             unsigned Align) = 0;
+                             Align Alignment) = 0;
 
   /// \returns True on success, false on failure.
   virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
@@ -110,7 +110,7 @@
 
   void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
 
-  void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+  void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
 
   /// \returns True on success, false on failure.
   bool EmitISAVersion(StringRef IsaVersionString) override;
@@ -158,7 +158,7 @@
 
   void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
 
-  void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+  void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
 
   /// \returns True on success, false on failure.
   bool EmitISAVersion(StringRef IsaVersionString) override;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -212,9 +212,9 @@
 }
 
 void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
-                                            unsigned Align) {
-  OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align
-     << '\n';
+                                            Align Alignment) {
+  OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", "
+     << Alignment.value() << '\n';
 }
 
 bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
@@ -515,9 +515,7 @@
 }
 
 void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
-                                            unsigned Align) {
-  assert(isPowerOf2_32(Align));
-
+                                            Align Alignment) {
   MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol);
   SymbolELF->setType(ELF::STT_OBJECT);
 
@@ -526,7 +524,7 @@
     SymbolELF->setExternal(true);
   }
 
-  if (SymbolELF->declareCommon(Size, Align, true)) {
+  if (SymbolELF->declareCommon(Size, Alignment.value(), true)) {
     report_fatal_error("Symbol: " + Symbol->getName() +
                        " redeclared as different type");
   }
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1265,10 +1265,11 @@
     return scalarizeVectorStore(StoreNode, DAG);
   }
 
-  unsigned Align = StoreNode->getAlignment();
-  if (Align < MemVT.getStoreSize() &&
-      !allowsMisalignedMemoryAccesses(
-          MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
+  Align Alignment = StoreNode->getAlign();
+  if (Alignment < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
+                                      StoreNode->getMemOperand()->getFlags(),
+                                      nullptr)) {
     return expandUnalignedStore(StoreNode, DAG);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -42,7 +42,8 @@
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
   SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                    const SDLoc &SL, SDValue Chain,
-                                   uint64_t Offset, unsigned Align, bool Signed,
+                                   uint64_t Offset, Align Alignment,
+                                   bool Signed,
                                    const ISD::InputArg *Arg = nullptr) const;
 
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
@@ -216,7 +217,7 @@
   /// \returns 0 If there is a non-constant offset or if the offset is 0.
   /// Otherwise returns the constant offset.
   unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                            SDValue *Offsets, unsigned Align = 4) const;
+                            SDValue *Offsets, Align Alignment = Align(4)) const;
 
   // Handle 8 bit and 16 bit buffer loads
   SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1578,16 +1578,15 @@
 }
 
 SDValue SITargetLowering::lowerKernargMemParameter(
-    SelectionDAG &DAG, EVT VT, EVT MemVT,
-    const SDLoc &SL, SDValue Chain,
-    uint64_t Offset, unsigned Align, bool Signed,
-    const ISD::InputArg *Arg) const {
+    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
+    uint64_t Offset, Align Alignment, bool Signed,
+    const ISD::InputArg *Arg) const {
   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
 
   // Try to avoid using an extload by loading earlier than the argument address,
   // and extracting the relevant bits. The load should hopefully be merged with
   // the previous argument.
-  if (MemVT.getStoreSize() < 4 && Align < 4) {
+  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
     // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
     int64_t AlignDownOffset = alignDown(Offset, 4);
     int64_t OffsetDiff = Offset - AlignDownOffset;
@@ -1613,9 +1612,9 @@
   }
 
   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
-  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
                              MachineMemOperand::MODereferenceable |
-                             MachineMemOperand::MOInvariant);
+                                 MachineMemOperand::MOInvariant);
 
   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
@@ -2233,9 +2232,9 @@
   //
   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
   // kern arg offset.
-  const unsigned KernelArgBaseAlign = 16;
+  const Align KernelArgBaseAlign = Align(16);
 
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
     if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
       InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -2250,10 +2249,11 @@
       EVT MemVT = VA.getLocVT();
 
       const uint64_t Offset = VA.getLocMemOffset();
-      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
+      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
 
-      SDValue Arg = lowerKernargMemParameter(
-        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
+      SDValue Arg =
+          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment,
+                                   Ins[i].Flags.isSExt(), &Ins[i]);
       Chains.push_back(Arg.getValue(1));
 
       auto *ParamTy =
@@ -3127,7 +3127,7 @@
   SDValue Size = Tmp2.getOperand(1);
   SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   Chain = SP.getValue(1);
-  unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+  MaybeAlign Alignment(cast<ConstantSDNode>(Tmp3)->getZExtValue());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const TargetFrameLowering *TFL = ST.getFrameLowering();
   unsigned Opc =
@@ -3138,12 +3138,13 @@
       ISD::SHL, dl, VT, Size,
       DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
 
-  unsigned StackAlign = TFL->getStackAlignment();
+  Align StackAlign = TFL->getStackAlign();
   Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
-  if (Align > StackAlign) {
-    Tmp1 = DAG.getNode(
-        ISD::AND, dl, VT, Tmp1,
-        DAG.getConstant(-(uint64_t)Align << ST.getWavefrontSizeLog2(), dl, VT));
+  if (Alignment && *Alignment > StackAlign) {
+    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+                       DAG.getConstant(-(uint64_t)Alignment->value()
+                                           << ST.getWavefrontSizeLog2(),
+                                       dl, VT));
   }
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
@@ -5538,11 +5539,11 @@
   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
   const DataLayout &DataLayout = DAG.getDataLayout();
-  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+  Align Alignment = DataLayout.getABITypeAlign(PtrTy);
   MachinePointerInfo PtrInfo =
       MachinePointerInfo::getGOT(DAG.getMachineFunction());
 
-  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
                      MachineMemOperand::MODereferenceable |
                          MachineMemOperand::MOInvariant);
 }
@@ -5568,8 +5569,8 @@
                                          MVT VT,
                                          unsigned Offset) const {
   SDLoc SL(Op);
-  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
-                                           DAG.getEntryNode(), Offset, 4, false);
+  SDValue Param = lowerKernargMemParameter(
+      DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
   // The local size values will have the hi 16-bits as zero.
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                      DAG.getValueType(VT));
@@ -6203,7 +6204,8 @@
     // Use the alignment to ensure that the required offsets will fit into the
    // immediate offsets.
-    setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+    setBufferOffsets(Offset, DAG, &Ops[3],
+                     NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
 
    uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
    for (unsigned i = 0; i < NumLoads; ++i) {
@@ -6299,37 +6301,43 @@
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
+                                    SI::KernelInputOffsets::NGROUPS_X, Align(4),
+                                    false);
   case Intrinsic::r600_read_ngroups_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
+                                    SI::KernelInputOffsets::NGROUPS_Y, Align(4),
+                                    false);
   case Intrinsic::r600_read_ngroups_z:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
+                                    SI::KernelInputOffsets::NGROUPS_Z, Align(4),
+                                    false);
   case Intrinsic::r600_read_global_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_X,
+                                    Align(4), false);
   case Intrinsic::r600_read_global_size_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y,
+                                    Align(4), false);
   case Intrinsic::r600_read_global_size_z:
     if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z,
+                                    Align(4), false);
   case Intrinsic::r600_read_local_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -7618,13 +7626,14 @@
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
 unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                            SelectionDAG &DAG, SDValue *Offsets,
-                                            unsigned Align) const {
+                                            SelectionDAG &DAG, SDValue *Offsets,
+                                            Align Alignment) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
+                                 Alignment)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -7637,7 +7646,7 @@
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
     if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
-                                                Subtarget, Align)) {
+                                                Subtarget, Alignment)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -15,6 +15,7 @@
 #include "llvm/IR/CallingConv.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetParser.h"
@@ -692,7 +693,8 @@
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
+                      const GCNSubtarget *Subtarget,
+                      Align Alignment = Align(4));
 
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1377,8 +1377,8 @@
 // aligned if they are aligned to begin with. It also ensures that additional
 // offsets within the given alignment can be added to the resulting ImmOffset.
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align) {
-  const uint32_t MaxImm = alignDown(4095, Align);
+                      const GCNSubtarget *Subtarget, Align Alignment) {
+  const uint32_t MaxImm = alignDown(4095, Alignment.value());
   uint32_t Overflow = 0;
 
   if (Imm > MaxImm) {
@@ -1396,10 +1396,10 @@
       //
      // Atomic operations fail to work correctly when individual address
      // components are unaligned, even if their sum is aligned.
-      uint32_t High = (Imm + Align) & ~4095;
-      uint32_t Low = (Imm + Align) & 4095;
+      uint32_t High = (Imm + Alignment.value()) & ~4095;
+      uint32_t Low = (Imm + Alignment.value()) & 4095;
       Imm = Low;
-      Overflow = High - Align;
+      Overflow = High - Alignment.value();
     }
   }
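Note (not part of the patch): a minimal standalone sketch of the idiom this diff migrates to, assuming an LLVM tree contemporary with the change (MaybeAlign::getValueOr, llvm::alignTo(uint64_t, Align), and llvm::commonAlignment as declared in llvm/Support/Alignment.h). The variable names below are illustrative only and do not appear in the patch.

// sketch.cpp -- shows the unsigned -> Align replacements used throughout the diff.
#include "llvm/Support/Alignment.h"
#include <cstdint>
#include <cstdio>

using llvm::Align;
using llvm::MaybeAlign;

int main() {
  // Old pattern: unsigned Align = GV->getAlignment(); if (!Align) Align = 4;
  // New pattern: a MaybeAlign (possibly unset) collapsed onto a default.
  MaybeAlign Requested;                             // stands in for GV->getAlign()
  Align Alignment = Requested.getValueOr(Align(4)); // Align(4) if no explicit align

  // llvm::alignTo has an overload taking Align directly, so no .value() round-trip.
  uint64_t LDSSize = 13;
  uint64_t Offset = llvm::alignTo(LDSSize, Alignment); // 16 for Align(4)... wait, 16

  // commonAlignment(Base, Offset) replaces MinAlign in the kernarg offset code.
  Align KernArgBaseAlign(16);
  Align ArgAlign = llvm::commonAlignment(KernArgBaseAlign, /*Offset=*/8); // Align(8)

  std::printf("align=%llu offset=%llu argalign=%llu\n",
              (unsigned long long)Alignment.value(),
              (unsigned long long)Offset,
              (unsigned long long)ArgAlign.value());
  return 0;
}

The design point the patch exploits is that Align is power-of-two by construction, so the asserts and "0 means default" checks scattered through the old code (e.g. the isPowerOf2_32 assert removed from AMDGPUTargetELFStreamer::emitAMDGPULDS) become unnecessary; .value() is only needed at boundaries that still take a raw integer, such as allowsMisalignedMemoryAccesses and MCSymbolELF::declareCommon.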