Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -329,6 +329,10 @@ return false; } + /// \brief Return true if it is cheaper to split the store of a merged int val + /// from a pair of smaller values into multiple stores. + virtual bool isMultiStoresCheaperThanBitsMerge() const { return false; } + /// \brief Return if the target supports combining a /// chain like: /// \code Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -374,6 +374,7 @@ SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); + SDValue splitMergedValStore(StoreSDNode *ST); SDValue TransformFPLoadStorePair(SDNode *N); SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); @@ -12155,9 +12156,122 @@ return NewSt; } + if (TLI.isMultiStoresCheaperThanBitsMerge()) { + if (SDValue NewSt = splitMergedValStore(ST)) + return NewSt; + } + return ReduceLoadOpStoreWidth(N); } +/// For the instruction sequence of store below, i32_tmp and float_tmp +/// are bundled together as an i64 data before stored into memory. If the +/// i64 data is not used outside of the store, it is more efficient to +/// generate separate stores for i32_tmp and float_tmp. 
+/// +/// Instruction sequence of i64 Store: +/// t1: i32 = bitcast float_tmp +/// t2: i64 = zero_extend t1 +/// t3: i64 = zero_extend i32_tmp +/// t4: i64 = shl t3, Constant:i8<32> +/// t5: i64 = or i64 t2, t4 +/// t6: ch = store t0, t5, FrameIndex:i64<0>, undef:i64 +/// +/// Instruction sequence of split i32 stores: +/// t1: i32 = bitcast float_tmp +/// t2: ch = store t0, t1, FrameIndex:i64<0>, undef:i64 +/// t3: i64 = add FrameIndex:i64<0>, Constant:i64<4> +/// t4: ch = store t0, i32_tmp, t3, undef:i64 +/// +/// Similarly, +/// {i32, i32} pair store can be converted from i64 store to two i32 stores. +/// {i32, i16} pair store can be converted from i64 store to two i32 stores. +/// {i16, i16} pair store can be converted from i32 store to two i16 stores. +/// {i16, i8} pair store can be converted from i32 store to two i16 stores. +/// {i8, i8} pair store can be converted from i16 store to two i8 stores. +/// +/// The store patterns are commonly seen from the simple code snippet below +/// if only std::make_pair(...) is sroa transformed before inlined into hoo. +/// void goo(const std::pair<int, float> &); +/// hoo() { +/// ... +/// goo(std::make_pair(tmp, ftmp)); +/// ... +/// } +/// +SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { + if (OptLevel == CodeGenOpt::None) + return SDValue(); + + SDValue Val = ST->getValue(); + SDLoc DL(ST); + + // Match OR operand. + if (!Val.getValueType().isScalarInteger() || !Val.getNode()->hasOneUse() || + Val.getOpcode() != ISD::OR) + return SDValue(); + + // Match SHL operand and get Lower and Higher parts of Val. + SDValue Op1 = Val.getOperand(0); + SDValue Op2 = Val.getOperand(1); + SDValue Shl, Lo, Hi; + if (Op1.getOpcode() == ISD::SHL) { + Shl = Op1; + Lo = Op2; + Hi = Shl.getOperand(0); + } else if (Op2.getOpcode() == ISD::SHL) { + Shl = Op2; + Lo = Op1; + Hi = Shl.getOperand(0); + } else { + return SDValue(); + } + + if (!Shl.hasOneUse()) + return SDValue(); + + // Match SHL amount to HalfValBitSize. 
+ unsigned HalfValBitSize = Val.getValueType().getSizeInBits() / 2; + ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); + if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize) + return SDValue(); + + // Lo and Hi are zero-extended from int with size less than or equal to 32 + // to i64. + if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() || + !Lo.getOperand(0).getValueType().isScalarInteger() || + Lo.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize || + Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() || + !Hi.getOperand(0).getValueType().isScalarInteger() || + Hi.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize) + return SDValue(); + + // Start to split store. + unsigned Alignment = ST->getAlignment(); + MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); + AAMDNodes AAInfo = ST->getAAInfo(); + + // Change the sizes of Lo and Hi's value types to HalfValBitSize. + EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize); + Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0)); + Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0)); + + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); + // Lower value store. + SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), + ST->getAlignment(), MMOFlags, AAInfo); + Ptr = + DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType())); + // Higher value store. 
+ SDValue St1 = + DAG.getStore(Chain, DL, Hi, Ptr, + ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), + Alignment / 2, MMOFlags, AAInfo); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1); +} + SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -764,6 +764,8 @@ return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); } + bool isMultiStoresCheaperThanBitsMerge() const override { return true; } + bool hasAndNotCompare(SDValue Y) const override; /// Return the value type to use for ISD::SETCC. Index: test/CodeGen/X86/split-store.ll =================================================================== --- test/CodeGen/X86/split-store.ll +++ test/CodeGen/X86/split-store.ll @@ -0,0 +1,193 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) + +declare void @goo1(%"pair1"* dereferenceable(8)) local_unnamed_addr +%"pair1" = type { i32, float } + +; CHECK-LABEL: int32_float_pair +; CHECK: movss %xmm0, 4(%rsp) +; CHECK: movl %edi, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_float_pair(i32 %tmp1, float %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair1"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %t1 = bitcast float %tmp2 to i32 + %retval.sroa.2.0.insert.ext.i = zext i32 %t1 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 
%retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8 + call void @goo1(%"pair1"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo2(%"pair2"* dereferenceable(8)) local_unnamed_addr +%"pair2" = type { float, i32 } + +; CHECK-LABEL: float_int32_pair +; CHECK: movl %edi, 4(%rsp) +; CHECK: movss %xmm0, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @float_int32_pair(float %tmp1, i32 %tmp2) local_unnamed_addr #0 { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair2"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) #5 + %t1 = bitcast float %tmp1 to i32 + %retval.sroa.2.0.insert.ext.i = zext i32 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %t1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8 + call void @goo2(%"pair2"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) #5 + ret void +} + +declare void @goo3(%"pair3"* dereferenceable(8)) local_unnamed_addr +%"pair3" = type { i32, i32 } + +; CHECK-LABEL: int32_int32_pair +; CHECK: movl %esi, 4(%rsp) +; CHECK: movl %edi, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_int32_pair(i32 %tmp1, i32 %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair3"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i32 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, 
i64* %ref.tmp, align 8 + call void @goo3(%"pair3"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo4(%"pair4"* dereferenceable(8)) local_unnamed_addr +%"pair4" = type { i32, i16 } + +; CHECK-LABEL: int32_int16_pair +; CHECK: movl %edi, (%rsp) +; CHECK: movzwl %si, %eax +; CHECK: movl %eax, 4(%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_int16_pair(i32 %tmp1, i16 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair4"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i16 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8 + call void @goo4(%"pair4"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo5(%"pair5"* dereferenceable(8)) local_unnamed_addr +%"pair5" = type { i32, i8 } + +; CHECK-LABEL: int32_int8_pair +; CHECK: movl %edi, (%rsp) +; CHECK: movzbl %sil, %eax +; CHECK: movl %eax, 4(%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_int8_pair(i32 %tmp1, i8 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair5"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i8 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, i64* 
%ref.tmp, align 8 + call void @goo5(%"pair5"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo6(%"pair6"* dereferenceable(8)) local_unnamed_addr +%"pair6" = type { i16, i16 } + +; CHECK-LABEL: int16_int16_pair +; CHECK: movw %si, 2(%rsp) +; CHECK: movw %di, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int16_int16_pair(i16 %tmp1, i16 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i32, align 8 + %tmpcast = bitcast i32* %ref.tmp to %"pair6"* + %t0 = bitcast i32* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i16 %tmp2 to i32 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i32 %retval.sroa.2.0.insert.ext.i, 16 + %retval.sroa.0.0.insert.ext.i = zext i16 %tmp1 to i32 + %retval.sroa.0.0.insert.insert.i = or i32 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i32 %retval.sroa.0.0.insert.insert.i, i32* %ref.tmp, align 8 + call void @goo6(%"pair6"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo7(%"pair7"* dereferenceable(8)) local_unnamed_addr +%"pair7" = type { i16, i8 } + +; CHECK-LABEL: int16_int8_pair +; CHECK: movw %di, (%rsp) +; CHECK: movzbl %sil, %eax +; CHECK: movw %ax, 2(%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int16_int8_pair(i16 %tmp1, i8 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i32, align 8 + %tmpcast = bitcast i32* %ref.tmp to %"pair7"* + %t0 = bitcast i32* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i8 %tmp2 to i32 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i32 %retval.sroa.2.0.insert.ext.i, 16 + %retval.sroa.0.0.insert.ext.i = zext i16 %tmp1 to i32 + %retval.sroa.0.0.insert.insert.i = or i32 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i32 %retval.sroa.0.0.insert.insert.i, i32* %ref.tmp, align 8 + call void 
@goo7(%"pair7"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo8(%"pair8"* dereferenceable(8)) local_unnamed_addr +%"pair8" = type { i8, i8 } + +; CHECK-LABEL: int8_int8_pair +; CHECK: movb %sil, 1(%rsp) +; CHECK: movb %dil, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int8_int8_pair(i8 %tmp1, i8 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i16, align 8 + %tmpcast = bitcast i16* %ref.tmp to %"pair8"* + %t0 = bitcast i16* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i8 %tmp2 to i16 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i16 %retval.sroa.2.0.insert.ext.i, 8 + %retval.sroa.0.0.insert.ext.i = zext i8 %tmp1 to i16 + %retval.sroa.0.0.insert.insert.i = or i16 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i16 %retval.sroa.0.0.insert.insert.i, i16* %ref.tmp, align 8 + call void @goo8(%"pair8"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +}