Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -330,6 +330,14 @@
     return false;
   }
 
+  /// \brief Return true if it is cheaper to split the store of a merged int val
+  /// from a pair of smaller values into multiple stores.
+  /// \p Lo and \p Hi are the low and high halves as they look before being
+  /// zero-extended and merged.
+  virtual bool isMultiStoresCheaperThanBitsMerge(SDValue Lo, SDValue Hi) const {
+    return false;
+  }
+
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -374,6 +374,7 @@
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
+    SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
     SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
@@ -12196,9 +12197,128 @@
       return NewSt;
   }
 
+  if (SDValue NewSt = splitMergedValStore(ST))
+    return NewSt;
+
   return ReduceLoadOpStoreWidth(N);
 }
 
+/// For the instruction sequence of store below, F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+///   (store (or (zext (bitcast F to i32) to i64),
+///              (shl (zext I to i64), 32)), addr)  -->
+///   (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial, like:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8},  i32 store --> two i16 stores.
+/// For pair of {i8, i8},   i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// The store patterns are commonly seen from the simple code snippet below
+/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
+///   void goo(const std::pair<int, float> &);
+///   hoo() {
+///     ...
+///     goo(std::make_pair(tmp, ftmp));
+///     ...
+///   }
+///
+/// Volatile, truncating and indexed stores are never split, and neither are
+/// stores on big-endian targets. Returns the token that replaces the store's
+/// chain result, or a null SDValue if the store was left untouched.
+///
+SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
+  if (OptLevel == CodeGenOpt::None)
+    return SDValue();
+
+  // Splitting changes the number and width of the memory accesses, which is
+  // not allowed for volatile stores. Truncating and indexed stores do not
+  // match the plain full-width store this transform rewrites.
+  if (ST->isVolatile() || ST->isTruncatingStore() || ST->isIndexed())
+    return SDValue();
+
+  // The low half is written at the base address and the high half right
+  // behind it; that matches the merged value's layout only on little-endian
+  // targets.
+  if (DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
+  SDValue Val = ST->getValue();
+  SDLoc DL(ST);
+
+  // Match OR operand.
+  if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
+    return SDValue();
+
+  // Match SHL operand and get Lower and Higher parts of Val.
+  SDValue Op1 = Val.getOperand(0);
+  SDValue Op2 = Val.getOperand(1);
+  SDValue Lo, Hi;
+  if (Op1.getOpcode() != ISD::SHL) {
+    std::swap(Op1, Op2);
+    if (Op1.getOpcode() != ISD::SHL)
+      return SDValue();
+  }
+  Lo = Op2;
+  Hi = Op1.getOperand(0);
+  if (!Op1.hasOneUse())
+    return SDValue();
+
+  // Match shift amount to HalfValBitSize.
+  unsigned HalfValBitSize = Val.getValueType().getSizeInBits() / 2;
+  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
+  if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
+    return SDValue();
+
+  // Lo and Hi are zero-extended from integers no wider than half of Val
+  // (e.g. at most 32 bits for an i64 store).
+  if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
+      !Lo.getOperand(0).getValueType().isScalarInteger() ||
+      Lo.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize ||
+      Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
+      !Hi.getOperand(0).getValueType().isScalarInteger() ||
+      Hi.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize)
+    return SDValue();
+
+  if (!TLI.isMultiStoresCheaperThanBitsMerge(Lo.getOperand(0),
+                                             Hi.getOperand(0)))
+    return SDValue();
+
+  // Start to split store.
+  unsigned Alignment = ST->getAlignment();
+  MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = ST->getAAInfo();
+
+  // Change the sizes of Lo and Hi's value types to HalfValBitSize.
+  EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
+
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+  // Lower value store.
+  SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+                             Alignment, MMOFlags, AAInfo);
+  Ptr =
+      DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                  DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType()));
+  // Higher value store. It sits HalfValBitSize / 8 bytes past the base, so
+  // its alignment is bounded by both the base alignment and that offset.
+  SDValue St1 =
+      DAG.getStore(Chain, DL, Hi, Ptr,
+                   ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
+                   MinAlign(Alignment, HalfValBitSize / 8), MMOFlags, AAInfo);
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -764,6 +764,26 @@
       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
     }
 
+    bool isMultiStoresCheaperThanBitsMerge(SDValue Lo,
+                                           SDValue Hi) const override {
+      // If the pair to store is a mixture of float and int values, we will
+      // save two bitwise instructions and one float-to-int instruction and
+      // increase one store instruction. There is potentially a more
+      // significant benefit because it avoids the float->int domain switch
+      // for input value. So it is more likely a win.
+      auto IsBitcastFromFP = [](SDValue V) {
+        return V.getOpcode() == ISD::BITCAST &&
+               V.getOperand(0).getValueType().isFloatingPoint();
+      };
+      if (IsBitcastFromFP(Lo) || IsBitcastFromFP(Hi))
+        return true;
+      // If the pair only contains int values, we will save two bitwise
+      // instructions and increase one store instruction (costing one more
+      // store buffer). Since the benefit is less clear, we leave such
+      // pairs out until we get a testcase to prove it is a win.
+      return false;
+    }
+
     bool hasAndNotCompare(SDValue Y) const override;
 
     /// Return the value type to use for ISD::SETCC.
Index: test/CodeGen/X86/split-store.ll
===================================================================
--- test/CodeGen/X86/split-store.ll
+++ test/CodeGen/X86/split-store.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: int32_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movl %edi, (%rsi)
+define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) local_unnamed_addr {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: float_int32_pair
+; CHECK: movl %edi, 4(%rsi)
+; CHECK: movss %xmm0, (%rsi)
+define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) local_unnamed_addr {
+entry:
+  %t0 = bitcast float %tmp1 to i32
+  %t1 = zext i32 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %t0 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int16_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzwl %di, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) local_unnamed_addr {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i16 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int8_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzbl %dil, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) local_unnamed_addr {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i8 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}