Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -101,6 +101,8 @@ setOperationAction(ISD::LOAD, MVT::i64, Promote); AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); @@ -113,6 +115,8 @@ setOperationAction(ISD::STORE, MVT::i64, Promote); AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -1905,10 +1909,17 @@ assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned NumElements = MemVT.getVectorNumElements(); - assert(NumElements != 2 && "v2 loads are supported for all address spaces."); - switch (Load->getAddressSpace()) { + unsigned AS = Load->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + AS, Load->getAlignment())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + + unsigned NumElements = MemVT.getVectorNumElements(); + switch (AS) { case AMDGPUAS::CONSTANT_ADDRESS: if (isMemOpUniform(Load)) return SDValue(); @@ -1943,9 +1954,16 @@ llvm_unreachable("unsupported private_element_size"); } } - case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); + + if (NumElements == 2) + return SDValue(); + // If properly aligned, if we split we might be able to use ds_read_b64. return SplitVectorLoad(Op, DAG); + } default: return SDValue(); } @@ -2150,10 +2168,17 @@ Store->getBasePtr(), MVT::i1, Store->getMemOperand()); } - assert(Store->getValue().getValueType().getScalarType() == MVT::i32); + assert(VT.isVector() && + Store->getValue().getValueType().getScalarType() == MVT::i32); + + unsigned AS = Store->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AS, Store->getAlignment())) { + return expandUnalignedStore(Store, DAG); + } unsigned NumElements = VT.getVectorNumElements(); - switch (Store->getAddressSpace()) { + switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::FLAT_ADDRESS: if (NumElements > 4) @@ -2175,9 +2200,16 @@ llvm_unreachable("unsupported private_element_size"); } } - case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) + return SplitVectorStore(Op, DAG); + + if (NumElements == 2) + return Op; + // If properly aligned, if we split we might be able to use ds_write_b64. return SplitVectorStore(Op, DAG); + } default: llvm_unreachable("unhandled address space"); } Index: test/CodeGen/AMDGPU/unaligned-load-store.ll =================================================================== --- test/CodeGen/AMDGPU/unaligned-load-store.ll +++ test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}unaligned_load_store_i16_local: @@ -56,6 +56,29 @@ ret void } +; SI-LABEL: {{^}}align2_load_store_i32_global: +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_short +; SI: buffer_store_short +define void @align2_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { + %v = load i32, i32 addrspace(1)* %p, align 2 + store i32 %v, i32 addrspace(1)* %r, align 2 + ret void +} + +; SI-LABEL: {{^}}align2_load_store_i32_local: +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_write_b16 +; SI: ds_write_b16 +define void @align2_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { + %v = load i32, i32 addrspace(3)* %p, align 2 + store i32 %v, i32 addrspace(3)* %r, align 2 + ret void +} + +; FIXME: Unnecessary packing and unpacking of bytes. ; SI-LABEL: {{^}}unaligned_load_store_i64_local: ; SI: ds_read_u8 ; SI: ds_read_u8 @@ -65,13 +88,36 @@ ; SI: ds_read_u8 ; SI: ds_read_u8 ; SI: ds_read_u8 + +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl ; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + ; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + ; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + ; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + ; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + ; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + ; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl ; SI: ds_write_b8 ; SI: s_endpgm define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { @@ -80,6 +126,53 @@ ret void } +; SI-LABEL: {{^}}unaligned_load_store_v2i32_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl +; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + +; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + +; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + +; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + +; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + +; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl + +; SI: ds_write_b8 +; XSI-NOT: v_or_b32 +; XSI-NOT: v_lshl +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_v2i32_local(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) { + %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1 + store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1 + ret void +} + ; SI-LABEL: {{^}}unaligned_load_store_i64_global: ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte @@ -89,6 +182,10 @@ ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte + +; XSI-NOT: v_or_ +; XSI-NOT: v_lshl + ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI: buffer_store_byte