Index: llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1120,26 +1120,36 @@
   unsigned AS = StoreNode->getAddressSpace();
   SDValue Value = StoreNode->getValue();
   EVT ValueVT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+  unsigned Align = StoreNode->getAlignment();
 
   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
       ValueVT.isVector()) {
     return SplitVectorStore(Op, DAG);
   }
 
+  // The private address space needs its own lowering (see below).
+  if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return expandUnalignedStore(StoreNode, DAG);
+  }
+
   SDLoc DL(Op);
   SDValue Chain = StoreNode->getChain();
   SDValue Ptr = StoreNode->getBasePtr();
 
   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+    // It is beneficial to create the MSKOR here rather than in the combiner,
+    // to avoid the artificial dependencies a read-modify-write introduces.
    if (StoreNode->isTruncatingStore()) {
       EVT VT = Value.getValueType();
       assert(VT.bitsLE(MVT::i32));
-      EVT MemVT = StoreNode->getMemoryVT();
       SDValue MaskConstant;
       if (MemVT == MVT::i8) {
         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
       } else {
         assert(MemVT == MVT::i16);
+        assert(StoreNode->getAlignment() >= 2);
         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
       }
       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
@@ -1183,7 +1193,6 @@
   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
     return SDValue();
 
-  EVT MemVT = StoreNode->getMemoryVT();
   if (MemVT.bitsLT(MVT::i32))
     return lowerPrivateTruncStore(StoreNode, DAG);
 
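For reference, the dispatch order this hunk establishes in LowerSTORE can be summarized in a few lines. The sketch below is a standalone C++ approximation with simplified stand-in types; StoreInfo, Action, and classify are hypothetical names, not SelectionDAG API, and it only illustrates the decision sequence, not the DAG node construction.

  enum AddressSpace { GLOBAL, LOCAL, PRIVATE };

  struct StoreInfo {
    AddressSpace AS;     // address space of the store
    unsigned Align;      // alignment in bytes
    unsigned StoreSize;  // memory VT store size in bytes
    bool IsVector;       // value operand is a vector
    bool Truncating;     // truncating store
  };

  enum class Action { SplitVector, ExpandUnaligned, Mskor, PrivateLowering, Default };

  // MisalignedOK stands in for allowsMisalignedMemoryAccesses() on the
  // given type, address space, and alignment.
  Action classify(const StoreInfo &S, bool MisalignedOK) {
    // Vector stores to LOCAL/PRIVATE are split into scalar stores first.
    if ((S.AS == LOCAL || S.AS == PRIVATE) && S.IsVector)
      return Action::SplitVector;

    // New with this patch: an under-aligned store outside the private
    // address space is expanded generically before any other lowering.
    if (S.Align < S.StoreSize && S.AS != PRIVATE && !MisalignedOK)
      return Action::ExpandUnaligned;

    // Sub-dword global truncating stores become MSKOR; thanks to the
    // expansion above, the i16 case is at least 2-byte aligned here.
    if (S.AS == GLOBAL && S.Truncating && S.StoreSize < 4)
      return Action::Mskor;

    // Sub-dword private stores take the dedicated private path.
    if (S.AS == PRIVATE && S.StoreSize < 4)
      return Action::PrivateLowering;

    return Action::Default;
  }

  int main() {
    // An i16 truncating global store with 1-byte alignment is expanded
    // before it can reach the MSKOR path, which is what the new
    // assert(StoreNode->getAlignment() >= 2) relies on.
    StoreInfo S{GLOBAL, /*Align=*/1, /*StoreSize=*/2,
                /*IsVector=*/false, /*Truncating=*/true};
    return classify(S, /*MisalignedOK=*/false) == Action::ExpandUnaligned ? 0 : 1;
  }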
Index: llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
@@ -5,6 +5,11 @@
 
 ; FUNC-LABEL: {{^}}store_i1:
 ; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+
 ; GCN: buffer_store_byte
 define void @store_i1(i1 addrspace(1)* %out) {
 entry:
@@ -15,6 +20,7 @@
 ; i8 store
 ; FUNC-LABEL: {{^}}store_i8:
 ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-NOT: MEM_RAT MSKOR
 
 ; IG 0: Get the byte index and truncate the value
 ; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
@@ -45,6 +51,7 @@
 ; i16 store
 ; FUNC-LABEL: {{^}}store_i16:
 ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-NOT: MEM_RAT MSKOR
 
 ; IG 0: Get the byte index and truncate the value
 
@@ -78,6 +85,9 @@
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 ; GCN-DAG: buffer_store_byte
 ; GCN-DAG: buffer_store_short
+
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
 define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
 entry:
   store i24 %in, i24 addrspace(1)* %out
@@ -88,6 +98,12 @@
 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
 ; GCN: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
 ; GCN: buffer_store_dword [[VAND]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG-NOT: MEM_RAT
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT
 define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
 entry:
   store i25 %in, i25 addrspace(1)* %out
@@ -95,9 +111,13 @@
 }
 
 ; FUNC-LABEL: {{^}}store_v2i8:
+; v2i8 is naturally 2-byte aligned
 ; EG: MEM_RAT MSKOR
 ; EG-NOT: MEM_RAT MSKOR
+
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+
 ; GCN: buffer_store_short
 define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
@@ -106,6 +126,23 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v2i8_unaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+
+; SI: buffer_store_byte
+define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i8>
+  store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}store_v2i16:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
 
@@ -120,6 +157,26 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v2i16_unaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i16>
+  store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}store_v4i8:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
 
@@ -133,6 +190,54 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v4i8_unaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI-NOT: buffer_store_dword
+define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i8>
+  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI-NOT: buffer_store_dword
+define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i8>
+  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
+  ret void
+}
+
 ; floating-point store
 ; FUNC-LABEL: {{^}}store_f32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
@@ -147,7 +252,9 @@
 }
 
 ; FUNC-LABEL: {{^}}store_v4i16:
-; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}
 
 ; GCN: buffer_store_dwordx2
 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
@@ -198,6 +305,20 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v4i32_unaligned:
+; EG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XYZW}}
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_dwordx4
+define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
 ; v4f32 store
 ; FUNC-LABEL: {{^}}store_v4f32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
@@ -215,6 +336,9 @@
 
 ; FUNC-LABEL: {{^}}store_i64_i8:
 ; EG: MEM_RAT MSKOR
+
+; CM: MEM_RAT MSKOR
+
 ; GCN: buffer_store_byte
 define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
 entry:
@@ -234,16 +358,15 @@
 }
 
 ; The stores in this function are combined by the optimizer to create a
-; 64-bit store with 32-bit alignment. This is legal for GCN and the legalizer
+; 64-bit store with 32-bit alignment. This is legal, and the legalizer
 ; should not try to split the 64-bit store back into 2 32-bit stores.
-;
-; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
-; be two 32-bit stores.
 
 ; FUNC-LABEL: {{^}}vecload2:
-; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XY, T[0-9]+\.X}}, 1
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
 
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx2
 define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
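The MSKOR patterns checked above correspond to how Evergreen emulates a sub-dword global store: the lowering shifts a mask and the value into position within the containing dword, and the MEM_RAT MSKOR instruction performs the masked read-modify-write. Below is a scalar C++ sketch of that arithmetic, mirroring the MaskConstant, dword-address, and shift computation in the lowering; the function name and the explicit oldDword parameter are illustrative only, since the hardware performs the merge itself.

  #include <cstdint>
  #include <cstdio>

  // Merge a 1- or 2-byte store into its containing 32-bit word the way
  // the MSKOR lowering composes it: mask and value are shifted by the
  // byte offset within the dword (BitShift = (Ptr & 3) * 8).
  uint32_t mskorMerge(uint32_t oldDword, uint32_t byteAddr, uint32_t value,
                      unsigned storeBytes) {
    uint32_t mask = storeBytes == 1 ? 0xFFu : 0xFFFFu; // MaskConstant
    uint32_t bitShift = (byteAddr & 3u) * 8u;          // byte index -> bits
    uint32_t keep = ~(mask << bitShift);               // lanes left intact
    return (oldDword & keep) | ((value & mask) << bitShift);
  }

  int main() {
    // Storing byte 0xAB at byte offset 2 of the dword 0x11223344
    // yields 0x11AB3344; only the addressed byte changes.
    std::printf("%08x\n", (unsigned)mskorMerge(0x11223344u, 2, 0xAB, 1));
    return 0;
  }

This is also why the unaligned i16 case has to be expanded first: a 2-byte value at an odd address could straddle a dword boundary, which a single MSKOR cannot express.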
Index: llvm/trunk/test/CodeGen/AMDGPU/store-local.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-local.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/store-local.ll
@@ -5,6 +5,9 @@
 
 ; FUNC-LABEL: {{^}}store_local_i1:
 ; EG: LDS_BYTE_WRITE
+
+; CM: LDS_BYTE_WRITE
+
 ; GCN: ds_write_b8
 define void @store_local_i1(i1 addrspace(3)* %out) {
 entry:
@@ -15,6 +18,8 @@
 ; FUNC-LABEL: {{^}}store_local_i8:
 ; EG: LDS_BYTE_WRITE
 
+; CM: LDS_BYTE_WRITE
+
 ; GCN: ds_write_b8
 define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
@@ -24,6 +29,8 @@
 ; FUNC-LABEL: {{^}}store_local_i16:
 ; EG: LDS_SHORT_WRITE
 
+; CM: LDS_SHORT_WRITE
+
 ; GCN: ds_write_b16
 define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
@@ -54,12 +61,54 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_local_v4i8_unaligned:
+; EG: LDS_BYTE_WRITE
+; EG: LDS_BYTE_WRITE
+; EG: LDS_BYTE_WRITE
+; EG: LDS_BYTE_WRITE
+; EG-NOT: LDS_WRITE
+
+; CM: LDS_BYTE_WRITE
+; CM: LDS_BYTE_WRITE
+; CM: LDS_BYTE_WRITE
+; CM: LDS_BYTE_WRITE
+; CM-NOT: LDS_WRITE
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+entry:
+  store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_v4i8_halfaligned:
+; EG: LDS_SHORT_WRITE
+; EG: LDS_SHORT_WRITE
+; EG-NOT: LDS_WRITE
+
+; CM: LDS_SHORT_WRITE
+; CM: LDS_SHORT_WRITE
+; CM-NOT: LDS_WRITE
+
+; GCN: ds_write_b16
+; GCN: ds_write_b16
+define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+entry:
+  store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}store_local_v2i32:
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
+; EG-NOT: LDS_WRITE
 
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
+; CM-NOT: LDS_WRITE
 
 ; GCN: ds_write_b64
 define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
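The local-store checks encode the same splitting rule as the global ones: an under-aligned store is broken into the widest naturally aligned pieces its alignment permits, so align 1 yields byte writes and align 2 yields short writes. A small C++ sketch of that arithmetic follows; splitPlan is a hypothetical helper for illustration, and the real work is done by expandUnalignedStore and the legalizer.

  #include <cstdio>

  // How many pieces of what width an N-byte store is split into when
  // the pointer is only align-byte aligned (align is a power of two no
  // larger than the natural alignment).
  void splitPlan(unsigned storeBytes, unsigned align) {
    unsigned width = align < storeBytes ? align : storeBytes;
    std::printf("%u-byte store, align %u -> %u x %u-byte stores\n",
                storeBytes, align, storeBytes / width, width);
  }

  int main() {
    splitPlan(4, 1); // v4i8, align 1 -> 4 x 1-byte (LDS_BYTE_WRITE x4)
    splitPlan(4, 2); // v4i8, align 2 -> 2 x 2-byte (LDS_SHORT_WRITE x2)
    splitPlan(2, 1); // v2i8, align 1 -> 2 x 1-byte
    return 0;
  }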