Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1886,12 +1886,22 @@
       llvm_unreachable("unsupported private_element_size");
     }
   }
-  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    if (NumElements > 2)
+      return SplitVectorLoad(Op, DAG);
+
+    if (Load->getAlignment() < 4) {
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+      return DAG.getMergeValues(Ops, DL);
+    }
+
     if (NumElements == 2)
-      return Op;
+      return SDValue();
 
     // If properly aligned, if we split we might be able to use ds_read_b64.
     return SplitVectorLoad(Op, DAG);
+  }
   default:
     return SDValue();
   }
@@ -2121,12 +2131,19 @@
       llvm_unreachable("unsupported private_element_size");
     }
   }
-  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    if (NumElements > 2)
+      return SplitVectorStore(Op, DAG);
+
+    if (Store->getAlignment() < 4)
+      return expandUnalignedStore(Store, DAG);
+
     if (NumElements == 2)
       return Op;
 
     // If properly aligned, if we split we might be able to use ds_write_b64.
     return SplitVectorStore(Op, DAG);
+  }
   default:
     llvm_unreachable("unhandled address space");
   }
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -65,13 +65,36 @@
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
 ; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
 ; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
 ; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
 ; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
 ; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
 ; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
 define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
@@ -80,6 +103,53 @@
   ret void
 }
 
+; SI-LABEL: {{^}}unaligned_load_store_v2i32_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_v2i32_local(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) {
+  %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
+  store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
+  ret void
+}
+
 ; SI-LABEL: {{^}}unaligned_load_store_i64_global:
 ; SI: buffer_load_ubyte
 ; SI: buffer_load_ubyte