Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -271,6 +271,15 @@
     return true;
   }
 
+  /// isStoreBitCastBeneficial() - Mirror of isLoadBitCastBeneficial(). Return
+  /// true if the following transform is beneficial.
+  ///
+  /// (store (y (conv x)), y*)) -> (store x, (x*))
+  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
+    // Default to the same logic as loads.
+    return isLoadBitCastBeneficial(StoreVT, BitcastVT);
+  }
+
   /// Return true if it is expected to be cheaper to do a store of a non-zero
   /// vector constant with the given size and type for the address space than to
   /// store the individual scalar element constants.
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11863,17 +11863,21 @@
   // resultant store does not need a higher alignment than the original.
   if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
       ST->isUnindexed()) {
-    unsigned OrigAlign = ST->getAlignment();
     EVT SVT = Value.getOperand(0).getValueType();
-    unsigned Align = DAG.getDataLayout().getABITypeAlignment(
-        SVT.getTypeForEVT(*DAG.getContext()));
-    if (Align <= OrigAlign &&
-        ((!LegalOperations && !ST->isVolatile()) ||
-         TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
-      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
-                          Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                          ST->isNonTemporal(), OrigAlign,
-                          ST->getAAInfo());
+    if (((!LegalOperations && !ST->isVolatile()) ||
+         TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) &&
+        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
+      unsigned OrigAlign = ST->getAlignment();
+      bool Fast = false;
+      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
+                                 ST->getAddressSpace(), OrigAlign, &Fast) &&
+          Fast) {
+        return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
+                            Ptr, ST->getPointerInfo(), ST->isVolatile(),
+                            ST->isNonTemporal(), OrigAlign,
+                            ST->getAAInfo());
+      }
+    }
   }
 
   // Turn 'store undef, Ptr' -> nothing.
Index: test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+  %x.bc = bitcast <2 x i32> %x to <4 x i16>
+  store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v8i16_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+  %x.bc = bitcast <4 x i32> %x to <8 x i16>
+  store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_i64_align_4(i64 addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+  %x.bc = bitcast <2 x i32> %x to i64
+  store i64 %x.bc, i64 addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v2i64_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+  %x.bc = bitcast <4 x i32> %x to <2 x i64>
+  store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+  %x.bc = bitcast <4 x i16> %x to <2 x i32>
+  store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }