diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8014,13 +8014,6 @@ assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, DL); - } - unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && @@ -8132,6 +8125,14 @@ return SplitVectorLoad(Op, DAG); } } + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + return SDValue(); } @@ -8537,11 +8538,6 @@ assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) { - return expandUnalignedStore(Store, DAG); - } - unsigned AS = Store->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && @@ -8566,6 +8562,11 @@ // v3 stores not supported on SI. if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) return SplitVectorStore(Op, DAG); + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) + return expandUnalignedStore(Store, DAG); + return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -8605,6 +8606,13 @@ return SplitVectorStore(Op, DAG); } + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) { + if (VT.isVector()) + return SplitVectorStore(Op, DAG); + return expandUnalignedStore(Store, DAG); + } + return SDValue(); } else { llvm_unreachable("unhandled address space"); diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -7,35 +7,15 @@ define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %arg2) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s8, s4 -; CHECK-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; CHECK-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; CHECK-NEXT: s_mov_b32 s6, -1 -; CHECK-NEXT: s_mov_b32 s7, 0xe8f000 -; CHECK-NEXT: s_add_u32 s4, s4, s8 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v1, v1 -; CHECK-NEXT: ds_read_b32 v2, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; CHECK-NEXT: ds_read_b32 v2, v1 +; CHECK-NEXT: ds_read_b32 v1, v4 ; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: ds_read_b32 v0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:28 -; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 -; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 -; CHECK-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 -; CHECK-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 -; CHECK-NEXT: s_waitcnt expcnt(0) -; CHECK-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: exp mrt0 off, off, off, off ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen @@ -50,42 +30,25 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) { ; CHECK-LABEL: test_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; CHECK-NEXT: s_mov_b32 s10, -1 -; CHECK-NEXT: s_mov_b32 s11, 0xe8f000 -; CHECK-NEXT: s_add_u32 s8, s8, s5 -; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 28, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 ; CHECK-NEXT: s_mov_b32 m0, -1 ; CHECK-NEXT: ds_read_b32 v4, v2 -; CHECK-NEXT: ds_read_b32 v5, v3 +; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: ds_read_b32 v2, v6 -; CHECK-NEXT: ds_read_b32 v3, v7 +; CHECK-NEXT: ds_read_b32 v9, v7 ; CHECK-NEXT: ds_read_b32 v8, v8 -; CHECK-NEXT: ds_read_b32 v9, v9 ; CHECK-NEXT: ds_read_b32 v7, v10 ; CHECK-NEXT: ds_read_b32 v6, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:28 -; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20 -; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:44 -; CHECK-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:40 +; CHECK-NEXT: ds_read_b32 v5, v5 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:36 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:32 ; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc ; CHECK-NEXT: s_endpgm %load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4 @@ -99,65 +62,42 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, <6 x float> addrspace(3)* %arg5, <6 x float> addrspace(3)* %arg6) { ; CHECK-LABEL: test_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; CHECK-NEXT: s_mov_b32 s10, -1 -; CHECK-NEXT: s_mov_b32 s11, 0xe8f000 -; CHECK-NEXT: s_add_u32 s8, s8, s6 -; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: s_mov_b32 s7, s5 ; CHECK-NEXT: s_mov_b32 s6, s4 ; CHECK-NEXT: s_mov_b32 s5, s3 ; CHECK-NEXT: s_mov_b32 s4, s2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, 8, v1 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v1 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1 ; CHECK-NEXT: v_mov_b32_e32 v9, s0 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 8, v2 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, 4, v2 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, 16, v2 -; CHECK-NEXT: v_add_i32_e32 v14, vcc, 20, v2 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v5, v0 -; CHECK-NEXT: ds_read_b32 v6, v3 -; CHECK-NEXT: ds_read_b32 v4, v4 -; CHECK-NEXT: ds_read_b32 v8, v8 -; CHECK-NEXT: ds_read_b32 v7, v7 ; CHECK-NEXT: ds_read_b32 v3, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:44 -; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:40 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:36 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32 +; CHECK-NEXT: ds_read_b32 v5, v4 +; CHECK-NEXT: ds_read_b32 v4, v7 +; CHECK-NEXT: ds_read_b32 v1, v8 +; CHECK-NEXT: ds_read_b32 v6, v6 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc -; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc -; CHECK-NEXT: ds_read_b32 v0, v10 -; CHECK-NEXT: ds_read_b32 v1, v11 -; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: ds_read_b32 v3, v12 -; CHECK-NEXT: ds_read_b32 v4, v13 -; CHECK-NEXT: ds_read_b32 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 -; CHECK-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16 -; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc ; CHECK-NEXT: s_waitcnt expcnt(0) -; CHECK-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 -; CHECK-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 -; CHECK-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 -; CHECK-NEXT: ds_read_b32 v5, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_read_b32 v0, v2 +; CHECK-NEXT: ds_read_b32 v2, v12 +; CHECK-NEXT: ds_read_b32 v1, v7 +; CHECK-NEXT: ds_read_b32 v5, v8 +; CHECK-NEXT: ds_read_b32 v3, v11 +; CHECK-NEXT: ds_read_b32 v4, v10 +; CHECK-NEXT: s_waitcnt lgkmcnt(5) ; CHECK-NEXT: exp mrt0 off, off, off, off +; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc