Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7979,13 +7979,6 @@ assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, DL); - } - unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && @@ -8097,6 +8090,14 @@ return SplitVectorLoad(Op, DAG); } } + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + return SDValue(); } @@ -8502,11 +8503,6 @@ assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) { - return expandUnalignedStore(Store, DAG); - } - unsigned AS = Store->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && @@ -8531,6 +8527,11 @@ // v3 stores not supported on SI. 
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) return SplitVectorStore(Op, DAG); + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) + return expandUnalignedStore(Store, DAG); + return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -8570,6 +8571,13 @@ return SplitVectorStore(Op, DAG); } + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) { + if (VT.isVector()) + return SplitVectorStore(Op, DAG); + return expandUnalignedStore(Store, DAG); + } + return SDValue(); } else { llvm_unreachable("unhandled address space"); Index: llvm/test/CodeGen/AMDGPU/load-local-spill.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/load-local-spill.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_vs void @spill_test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %arg2) { +; CHECK-LABEL: spill_test: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v2, v1 +; CHECK-NEXT: ds_read_b32 v1, v4 +; CHECK-NEXT: ds_read_b32 v3, v3 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: exp mrt0 off, off, off, off +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false) + %var1 = load <6 x float>, <6 x float> addrspace(3)* %arg2, align 4 + %var2 = 
shufflevector <6 x float> %var1, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %var2, <4 x i32> %arg1, i32 0, i32 0, i32 0, i32 immarg 126, i32 immarg 0) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) +declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) + + +define amdgpu_vs void @spill_test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) { +; CHECK-LABEL: spill_test_2: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v4, v2 +; CHECK-NEXT: ds_read_b32 v3, v3 +; CHECK-NEXT: ds_read_b32 v2, v6 +; CHECK-NEXT: ds_read_b32 v9, v7 +; CHECK-NEXT: ds_read_b32 v8, v8 +; CHECK-NEXT: ds_read_b32 v7, v10 +; CHECK-NEXT: ds_read_b32 v6, v1 +; CHECK-NEXT: ds_read_b32 v5, v5 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc +; CHECK-NEXT: s_endpgm + %load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4 + %vec1 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec1, <4 x i32> %arg1, i32 %arg2, i32 0, i32 %arg3, i32 immarg 77, i32 immarg 3) + %vec2 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + call void
@llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec2, <4 x i32> %arg1, i32 %arg2, i32 16, i32 %arg3, i32 immarg 77, i32 immarg 3) + ret void +}