Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -35,10 +35,17 @@ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { MachineFunction &MF = State.getMachineFunction(); + const DataLayout &DL = MF.getDataLayout(); AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); - uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(), - ArgFlags.getOrigAlign()); + Type *ValTy = EVT(ValVT).getTypeForEVT(State.getContext()); + + // XXX - What is orig align supposed to mean? It seems to be completely broken + // for any split vectors after the first component. + unsigned Align = std::max(DL.getABITypeAlignment(ValTy), + ArgFlags.getOrigAlign()); + + uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(), Align); State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); return true; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -360,6 +360,10 @@ return isAmdHsaOS() ? 
0 : 36; } + unsigned getKernargSegmentPtrAlignment() const { + return 256; + } + unsigned getMaxNumUserSGPRs() const { return 16; } Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -22,7 +22,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &DL, - SDValue Chain, unsigned Offset, bool Signed) const; + SDValue Chain, unsigned Offset, unsigned Align, + bool Signed) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -536,7 +536,8 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - unsigned Offset, bool Signed) const { + unsigned Offset, unsigned Align, + bool Signed) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = @@ -552,21 +553,38 @@ MRI.getLiveInVirtReg(InputPtrReg), PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Offset, SL, PtrVT)); - SDValue PtrOffset = DAG.getUNDEF(PtrVT); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - unsigned Align = DL.getABITypeAlignment(Ty); - ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + if (MemVT.isFloatingPoint()) ExtTy = ISD::EXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment + EVT LoadVT = VT.bitsGE(MemVT) ? 
VT : MemVT; + + SDValue Load = DAG.getExtLoad( + ExtTy, + SL, + LoadVT, + Chain, + Ptr, + PtrInfo, + MemVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); + + if (LoadVT == VT) + return Load; + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, SL, VT, Load), + Load.getValue(1) + }; + + return DAG.getMergeValues(Ops, SL); } SDValue SITargetLowering::LowerFormalArguments( @@ -726,10 +744,18 @@ EVT MemVT = Splits[i].VT; const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset(); + + // We already correctly aligned the offset when the argument was + // allocated. Figure out the alignment of the load from there. + unsigned Align + = MinAlign(Subtarget->getKernargSegmentPtrAlignment(), Offset); + // The first 36 bytes of the input buffer contains information about // thread group and global sizes. SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, - Offset, Ins[i].Flags.isSExt()); + Offset, + Align, + Ins[i].Flags.isSExt()); Chains.push_back(Arg.getValue(1)); auto *ParamTy = @@ -1488,8 +1514,13 @@ MVT VT, unsigned Offset) const { SDLoc SL(Op); + + unsigned Align + = MinAlign(Subtarget->getKernargSegmentPtrAlignment(), Offset); + + // FIXME: Get alignment from somewhere. Base pointer should be 256 aligned. SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, false); + DAG.getEntryNode(), Offset, Align, false); // The local size values will have the hi 16-bits as zero. 
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); @@ -1576,37 +1607,55 @@ return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); + SI::KernelInputOffsets::NGROUPS_X, + MinAlign(Subtarget->getKernargSegmentPtrAlignment(), + SI::KernelInputOffsets::NGROUPS_X), + false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); + SI::KernelInputOffsets::NGROUPS_Y, + MinAlign(Subtarget->getKernargSegmentPtrAlignment(), + SI::KernelInputOffsets::NGROUPS_Y), + false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); + SI::KernelInputOffsets::NGROUPS_Z, + MinAlign(Subtarget->getKernargSegmentPtrAlignment(), + SI::KernelInputOffsets::NGROUPS_Z), + false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + SI::KernelInputOffsets::GLOBAL_SIZE_X, + MinAlign(Subtarget->getKernargSegmentPtrAlignment(), + SI::KernelInputOffsets::GLOBAL_SIZE_X), + false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Y, + MinAlign(Subtarget->getKernargSegmentPtrAlignment(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y), + false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, 
VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Z, + MinAlign(Subtarget->getKernargSegmentPtrAlignment(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z), + false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT);