Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -193,6 +193,17 @@
 
   virtual bool useSoftFloat() const { return false; }
 
+  /// Return the preferred alignment to use for a temporary stack slot of
+  /// type \p VT, raised to at least \p MinAlign.
+  virtual unsigned getStackTemporaryPreferredAlign(const DataLayout &DL,
+                                                   LLVMContext &Context,
+                                                   EVT VT,
+                                                   unsigned MinAlign = 1) const;
+  virtual unsigned getStackTemporaryPreferredAlign(const DataLayout &DL,
+                                                   LLVMContext &Context,
+                                                   EVT VT1,
+                                                   EVT VT2) const;
+
   /// Return the pointer type for the given address space, defaults to
   /// the pointer type from the data layout.
   /// FIXME: The default needs to be removed once all the code is updated.
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -1633,14 +1634,14 @@
 SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
                                                EVT DestVT, const SDLoc &dl) {
   // Create the stack frame object.
-  unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment(
-      SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
-  SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
+  SDValue FIPtr = DAG.CreateStackTemporary(SrcOp.getValueType(), SlotVT);
 
   FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
   int SPFI = StackPtrFI->getIndex();
-  MachinePointerInfo PtrInfo =
-      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, SPFI);
+  unsigned SrcAlign = MF.getFrameInfo().getObjectAlignment(SPFI);
 
   unsigned SrcSize = SrcOp.getValueSizeInBits();
   unsigned SlotSize = SlotVT.getSizeInBits();
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1816,24 +1816,21 @@
                        MachinePointerInfo(VD));
 }
 
-SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
+SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned MinAlign) {
   MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
-  unsigned ByteSize = VT.getStoreSize();
-  Type *Ty = VT.getTypeForEVT(*getContext());
-  unsigned StackAlign =
-      std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);
+  unsigned StackAlign =
+      TLI->getStackTemporaryPreferredAlign(getDataLayout(), *getContext(),
+                                           VT, MinAlign);
+  unsigned ByteSize = VT.getStoreSize();
   int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
   return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
 }
 
 SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
   unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
-  Type *Ty1 = VT1.getTypeForEVT(*getContext());
-  Type *Ty2 = VT2.getTypeForEVT(*getContext());
-  const DataLayout &DL = getDataLayout();
-  unsigned Align =
-      std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));
+  unsigned Align =
+      TLI->getStackTemporaryPreferredAlign(getDataLayout(), *getContext(),
+                                           VT1, VT2);
   MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
   int FrameIdx = MFI.CreateStackObject(Bytes, Align, false);
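
Reviewer sketch (not part of the patch; patch tools skip this text between file
sections): the standalone program below models the contract of the new
getStackTemporaryPreferredAlign hook as used by CreateStackTemporary above.
The function names and bare-unsigned parameters are illustrative stand-ins,
not LLVM API; the first function mirrors the TargetLoweringBase default added
below, the second mirrors the AMDGPU override.

#include <algorithm>
#include <cstdio>

// Default behavior: use the preferred alignment of the slot type, raised to
// at least MinAlign (models TargetLoweringBase::getStackTemporaryPreferredAlign).
unsigned defaultStackTempAlign(unsigned PrefTypeAlign, unsigned MinAlign) {
  return std::max(PrefTypeAlign, MinAlign);
}

// AMDGPU-style override: 4-byte aligned accesses are always legal and stack
// objects are split into 4-byte accesses anyway, so cap the alignment at 4.
unsigned amdgpuStackTempAlign(unsigned /*PrefTypeAlign*/, unsigned MinAlign) {
  return std::max(4u, MinAlign);
}

int main() {
  // A <4 x double> temporary: the preferred type alignment is 32 bytes.
  printf("default: %u\n", defaultStackTempAlign(32, 1)); // prints 32
  printf("amdgpu:  %u\n", amdgpuStackTempAlign(32, 1));  // prints 4
  return 0;
}
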
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -970,6 +970,24 @@
   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
 }
 
+unsigned TargetLoweringBase::getStackTemporaryPreferredAlign(
+    const DataLayout &DL,
+    LLVMContext &Context,
+    EVT VT, unsigned MinAlign) const {
+  Type *Ty = VT.getTypeForEVT(Context);
+  return std::max((unsigned)DL.getPrefTypeAlignment(Ty), MinAlign);
+}
+
+unsigned TargetLoweringBase::getStackTemporaryPreferredAlign(
+    const DataLayout &DL,
+    LLVMContext &Context,
+    EVT VT1, EVT VT2) const {
+  Type *Ty1 = VT1.getTypeForEVT(Context);
+  Type *Ty2 = VT2.getTypeForEVT(Context);
+
+  return std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));
+}
+
 MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL,
                                                EVT) const {
   return MVT::getIntegerVT(8 * DL.getPointerSize(0));
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -119,6 +119,22 @@
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
 
+  // Any 4-byte aligned access is always legal, and stack objects are broken
+  // into 4-byte element accesses, so a higher alignment just wastes space.
+  unsigned getStackTemporaryPreferredAlign(const DataLayout &DL,
+                                           LLVMContext &Context,
+                                           EVT VT,
+                                           unsigned MinAlign = 1) const override {
+    return std::max(4u, MinAlign);
+  }
+
+  unsigned getStackTemporaryPreferredAlign(const DataLayout &DL,
+                                           LLVMContext &Context,
+                                           EVT VT1,
+                                           EVT VT2) const override {
+    return 4;
+  }
+
   bool mayIgnoreSignedZero(SDValue Op) const {
     if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only
       return true;
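
Reviewer sketch (not part of the patch): the smaller ScratchSize expectations
in the tests below follow from alignment padding alone. This standalone model
assumes 4 bytes of other stack objects are laid out before the vector
temporary; the actual AMDGPU frame layout may differ, but the arithmetic
reproduces both the old and the new numbers.

#include <cstdio>

// Round Offset up to the next multiple of Align.
unsigned alignTo(unsigned Offset, unsigned Align) {
  return (Offset + Align - 1) / Align * Align;
}

// Frame size with one temporary of TempSize bytes at TempAlign, placed after
// OtherBytes of unrelated objects (assumed to be 4 bytes here).
unsigned frameSize(unsigned OtherBytes, unsigned TempSize, unsigned TempAlign) {
  return alignTo(OtherBytes, TempAlign) + TempSize;
}

int main() {
  // <4 x double>: a 32-byte temporary, previously 32-byte aligned.
  printf("v4f64: %u -> %u\n", frameSize(4, 32, 32), frameSize(4, 32, 4)); // 64 -> 36
  // <8 x double>: a 64-byte temporary, previously 64-byte aligned.
  printf("v8f64: %u -> %u\n", frameSize(4, 64, 64), frameSize(4, 64, 4)); // 128 -> 68
  return 0;
}
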
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -209,7 +209,7 @@
 
 ; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}}
 ; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
-; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}
+; GCN-DAG: v_add_i32_e32 [[IDX:v[0-9]+]], vcc, [[MASK_IDX]], [[BASE_FI]]{{$}}
 
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
@@ -388,8 +388,7 @@
   ret void
 }
 
-; FIXME: Should be able to do without stack access. The used stack
-; space is also 2x what should be required.
+; FIXME: Should be able to do without stack access.
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
 ; GCN: SCRATCH_RSRC_DWORD
@@ -410,7 +409,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-; GCN: ScratchSize: 64
+; GCN: ScratchSize: 36
 define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
   store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
@@ -438,7 +437,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-; GCN: ScratchSize: 128
+; GCN: ScratchSize: 68
 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
Index: test/CodeGen/AMDGPU/local-stack-slot-bug.ll
===================================================================
--- test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -7,14 +7,12 @@
 ;
 ; CHECK-LABEL: {{^}}main:
-; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200
-; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
 ; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
 ; TODO: add 0?
-; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
-; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
+; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[CLAMP_IDX]]
+; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[CLAMP_IDX]]
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -15,7 +15,7 @@
 ; HSA: enable_sgpr_private_segment_buffer = 1
 ; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: workitem_private_segment_byte_size = 1024
+; HSA: workitem_private_segment_byte_size = 540
 
 ; GCN-NOT: flat_scr
 
@@ -40,7 +40,7 @@
 ; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}}
 
 ; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1024
+; GCN: ScratchSize: 540
 
 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
 define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -24,7 +24,7 @@
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
 
 ; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1024
+; GCN: ScratchSize: 536
 
 define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 bb:
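
Reviewer note (not part of the patch): the updated FileCheck expectations can
be verified locally by running the affected tests with lit from a build
directory that has the AMDGPU target enabled, for example
bin/llvm-lit -v test/CodeGen/AMDGPU/insert_vector_elt.ll, adjusting paths to
the local checkout and build layout.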