Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -271,6 +271,15 @@
     return true;
   }
 
+  /// isStoreBitCastBeneficial() - Mirror of isLoadBitCastBeneficial(). Return
+  /// true if the following transform is beneficial:
+  ///
+  /// (store (y (conv x)), y*) -> (store x, (x*))
+  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
+    // Default to the same logic as loads.
+    return isLoadBitCastBeneficial(StoreVT, BitcastVT);
+  }
+
   /// Return true if it is expected to be cheaper to do a store of a non-zero
   /// vector constant with the given size and type for the address space than to
   /// store the individual scalar element constants.
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11911,17 +11911,21 @@
   // resultant store does not need a higher alignment than the original.
   if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
       ST->isUnindexed()) {
-    unsigned OrigAlign = ST->getAlignment();
     EVT SVT = Value.getOperand(0).getValueType();
-    unsigned Align = DAG.getDataLayout().getABITypeAlignment(
-        SVT.getTypeForEVT(*DAG.getContext()));
-    if (Align <= OrigAlign &&
-        ((!LegalOperations && !ST->isVolatile()) ||
-         TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
-      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
-                          Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                          ST->isNonTemporal(), OrigAlign,
-                          ST->getAAInfo());
+    if (((!LegalOperations && !ST->isVolatile()) ||
+         TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) &&
+        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
+      unsigned OrigAlign = ST->getAlignment();
+      bool Fast = false;
+      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
+                                 ST->getAddressSpace(), OrigAlign, &Fast) &&
+          Fast) {
+        return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
+                            Ptr, ST->getPointerInfo(), ST->isVolatile(),
+                            ST->isNonTemporal(), OrigAlign,
+                            ST->getAAInfo());
+      }
+    }
   }
 
   // Turn 'store undef, Ptr' -> nothing.
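Reviewer note: a minimal sketch of how a backend might use the new hook, to make the intent concrete. Everything here is illustrative and not part of this patch; MyTargetLowering and the keep-dword-elements policy are hypothetical. In the DAGCombiner change above, StoreVT is Value.getValueType() (the bitcast result currently being stored) and BitcastVT is the bitcast source type the combine would store instead, so returning false keeps the store in its original type.

  // Hypothetical override (illustration only): refuse to rewrite a store
  // of 32-bit (or wider) elements into a store with sub-dword elements,
  // e.g. keep a v2i32 store rather than switching it to v4i16.
  bool MyTargetLowering::isStoreBitCastBeneficial(EVT StoreVT,
                                                  EVT BitcastVT) const {
    // Only reason about types that map to a simple machine value type.
    if (!StoreVT.isSimple() || !BitcastVT.isSimple())
      return false;

    if (StoreVT.getScalarSizeInBits() >= 32 &&
        BitcastVT.getScalarSizeInBits() < 32)
      return false;

    // Otherwise fall back to the default, which mirrors the load policy.
    return isLoadBitCastBeneficial(StoreVT, BitcastVT);
  }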
Index: test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+  %x.bc = bitcast <2 x i32> %x to <4 x i16>
+  store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v8i16_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+  %x.bc = bitcast <4 x i32> %x to <8 x i16>
+  store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_i64_align_4(i64 addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+  %x.bc = bitcast <2 x i32> %x to i64
+  store i64 %x.bc, i64 addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v2i64_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+  %x.bc = bitcast <4 x i32> %x to <2 x i64>
+  store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+  %x.bc = bitcast <4 x i16> %x to <2 x i32>
+  store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: test/CodeGen/ARM/vector-store.ll
===================================================================
--- test/CodeGen/ARM/vector-store.ll
+++ test/CodeGen/ARM/vector-store.ll
@@ -12,12 +12,14 @@
 }
 
 define void @store_v8i8_update(<8 x i8>** %ptr, <8 x i8> %val) {
-;CHECK-LABEL: store_v8i8_update:
-;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+; CHECK-LABEL: store_v8i8_update:
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}]
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}, #4]
+; CHECK: add.w r{{[0-9]+}}, r{{[0-9]+}}, #8
   %A = load <8 x i8>*, <8 x i8>** %ptr
   store <8 x i8> %val, <8 x i8>* %A, align 1
   %inc = getelementptr <8 x i8>, <8 x i8>* %A, i32 1
-  store <8 x i8>* %inc, <8 x i8>** %ptr
+  store <8 x i8>* %inc, <8 x i8>** %ptr
   ret void
 }
 
@@ -30,8 +32,10 @@
 }
 
 define void @store_v4i16_update(<4 x i16>** %ptr, <4 x i16> %val) {
-;CHECK-LABEL: store_v4i16_update:
-;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+; CHECK-LABEL: store_v4i16_update:
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}]
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}, #4]
+; CHECK: add.w r{{[0-9]+}}, r{{[0-9]+}}, #8
   %A = load <4 x i16>*, <4 x i16>** %ptr
   store <4 x i16> %val, <4 x i16>* %A, align 1
   %inc = getelementptr <4 x i16>, <4 x i16>* %A, i32 1
@@ -48,8 +52,11 @@
 }
 
 define void @store_v2i32_update(<2 x i32>** %ptr, <2 x i32> %val) {
-;CHECK-LABEL: store_v2i32_update:
-;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+; CHECK-LABEL: store_v2i32_update:
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}]
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}, #4]
+; CHECK: add.w r{{[0-9]+}}, r{{[0-9]+}}, #8
+
   %A = load <2 x i32>*, <2 x i32>** %ptr
   store <2 x i32> %val, <2 x i32>* %A, align 1
   %inc = getelementptr <2 x i32>, <2 x i32>* %A, i32 1
@@ -66,13 +73,15 @@
 }
 
 define void @store_v2f32_update(<2 x float>** %ptr, <2 x float> %val) {
-;CHECK-LABEL: store_v2f32_update:
-;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
-  %A = load <2 x float>*, <2 x float>** %ptr
-  store <2 x float> %val, <2 x float>* %A, align 1
-  %inc = getelementptr <2 x float>, <2 x float>* %A, i32 1
-  store <2 x float>* %inc, <2 x float>** %ptr
-  ret void
+; CHECK-LABEL: store_v2f32_update:
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}]
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}, #4]
+; CHECK: add.w r{{[0-9]+}}, r{{[0-9]+}}, #8
+  %A = load <2 x float>*, <2 x float>** %ptr
+  store <2 x float> %val, <2 x float>* %A, align 1
+  %inc = getelementptr <2 x float>, <2 x float>* %A, i32 1
+  store <2 x float>* %inc, <2 x float>** %ptr
+  ret void
 }
 
 define void @store_v1i64(<1 x i64>** %ptr, <1 x i64> %val) {
@@ -84,12 +93,15 @@
 }
 
 define void @store_v1i64_update(<1 x i64>** %ptr, <1 x i64> %val) {
-;CHECK-LABEL: store_v1i64_update:
-;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+; CHECK-LABEL: store_v1i64_update:
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}]
+; CHECK: str r{{[0-9]+}}, [{{r[0-9]+}}, #4]
+; CHECK: add.w r{{[0-9]+}}, r{{[0-9]+}}, #8
+
   %A = load <1 x i64>*, <1 x i64>** %ptr
   store <1 x i64> %val, <1 x i64>* %A, align 1
   %inc = getelementptr <1 x i64>, <1 x i64>* %A, i32 1
-  store <1 x i64>* %inc, <1 x i64>** %ptr
+  store <1 x i64>* %inc, <1 x i64>** %ptr
   ret void
 }
Index: test/CodeGen/X86/avx-vextractf128.ll
===================================================================
--- test/CodeGen/X86/avx-vextractf128.ll
+++ test/CodeGen/X86/avx-vextractf128.ll
@@ -119,7 +119,7 @@
 define void @t9(i64* %p) {
 ; CHECK-LABEL: t9:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT: vmovups %ymm0, (%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
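For context: the old combine required the ABI alignment of the bitcast source type to be no greater than the store's alignment, so a 4-byte-aligned store of a value bitcast from <2 x i32> (ABI alignment 8) was never rewritten. The new code instead asks allowsMemoryAccess() whether the access is fast at the original alignment, which enables the ds_write2_b32 patterns checked above while still letting targets block slow cases. An illustrative IR reproducer in the spirit of those tests (function name hypothetical, not taken from this patch):

  ; Illustration only: store of a bitcast at less than the source type's
  ; ABI alignment. The combiner may now rewrite this into a direct store
  ; of the <2 x i32> source, but only when the target reports a
  ; 4-byte-aligned v2i32 access as fast.
  define void @store_bitcast_underaligned(<4 x i16>* %out, <2 x i32> %x) {
    %x.bc = bitcast <2 x i32> %x to <4 x i16>
    store <4 x i16> %x.bc, <4 x i16>* %out, align 4
    ret void
  }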