Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -5179,6 +5179,127 @@
   return false;
 }
 
+/// For the instruction sequence of store below, F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+///   (store (or (zext (bitcast F to i32) to i64),
+///              (shl (zext I to i64), 32)), addr)  -->
+///   (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial, like:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8},  i32 store --> two i16 stores.
+/// For pair of {i8, i8},   i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// The store patterns are commonly seen from the simple code snippet below
+/// if only std::make_pair(...) is SROA-transformed before being inlined into hoo:
+///   void goo(const std::pair<int, float> &);
+///   hoo() {
+///     ...
+///     goo(std::make_pair(tmp, ftmp));
+///     ...
+///   }
+///
+/// Although we already have similar splitting in DAG Combine, we duplicate
+/// it in CodeGenPrepare to catch the case in which the pattern spans
+/// multiple BBs. The logic in DAG Combine is kept to catch cases generated
+/// during code expansion.
+static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
+                                const TargetLowering &TLI) {
+  // Match the OR operand.
+  BinaryOperator *OR = dyn_cast<BinaryOperator>(SI.getValueOperand());
+  if (!OR || OR->getOpcode() != Instruction::Or ||
+      OR->getParent() != SI.getParent())
+    return false;
+
+  // Match the SHL operand and get the lower and higher parts of the value.
+  Value *Op1 = OR->getOperand(0);
+  Value *Op2 = OR->getOperand(1);
+  BinaryOperator *SHL = dyn_cast<BinaryOperator>(Op1);
+  if (!SHL || SHL->getOpcode() != Instruction::Shl) {
+    std::swap(Op1, Op2);
+    SHL = dyn_cast<BinaryOperator>(Op1);
+    if (!SHL || SHL->getOpcode() != Instruction::Shl)
+      return false;
+  }
+  if (!SHL->hasOneUse())
+    return false;
+
+  // Match the shift amount to HalfValBitSize.
+  unsigned HalfValBitSize =
+      DL.getTypeSizeInBits(SI.getValueOperand()->getType()) / 2;
+  ConstantInt *CI = dyn_cast<ConstantInt>(SHL->getOperand(1));
+  if (!CI || CI->getValue() != HalfValBitSize)
+    return false;
+
+  // Check that ZL and ZH are zero-extended from an integer type no wider
+  // than HalfValBitSize.
+  ZExtInst *ZL = dyn_cast<ZExtInst>(Op2);
+  ZExtInst *ZH = dyn_cast<ZExtInst>(SHL->getOperand(0));
+  if (!ZL || !ZL->hasOneUse() || !ZH || !ZH->hasOneUse())
+    return false;
+  Value *LValue = ZL->getOperand(0);
+  Value *HValue = ZH->getOperand(0);
+  if (!LValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
+      !HValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
+    return false;
+
+  // If LValue/HValue is a bitcast instruction, use the EVT before the bitcast
+  // as the input of the target query.
+  EVT LowTy = EVT::getEVT(LValue->getType());
+  EVT HighTy = EVT::getEVT(HValue->getType());
+  BasicBlock *CurBB = OR->getParent();
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(LValue))
+    LowTy = EVT::getEVT(BC->getOperand(0)->getType(), true);
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(HValue))
+    HighTy = EVT::getEVT(BC->getOperand(0)->getType(), true);
+
+  if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
+    return false;
+
+  // Start to split the store.
+  IRBuilder<> Builder(SI.getContext());
+
+  // If LValue/HValue is a bitcast in another BB and has only one use, move
+  // it to the current BB so it may be merged with the split stores by the
+  // DAG combiner.
+  BitCastInst *LBC = dyn_cast<BitCastInst>(LValue);
+  if (LBC && LBC->hasOneUse() && LBC->getParent() != CurBB) {
+    Builder.SetInsertPoint(ZL);
+    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
+  }
+  BitCastInst *HBC = dyn_cast<BitCastInst>(HValue);
+  if (HBC && HBC->hasOneUse() && HBC->getParent() != CurBB) {
+    Builder.SetInsertPoint(ZH);
+    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
+  }
+
+  Builder.SetInsertPoint(&SI);
+  Type *Ty = Type::getIntNTy(SI.getContext(), HalfValBitSize);
+  Type *PtrTy = Ty->getPointerTo(SI.getPointerAddressSpace());
+  Value *Low = Builder.CreateZExtOrBitCast(LValue, Ty);
+  Value *LowAddr = Builder.CreateBitCast(SI.getOperand(1), PtrTy);
+  Builder.CreateAlignedStore(Low, LowAddr, SI.getAlignment());
+
+  Value *High = Builder.CreateZExtOrBitCast(HValue, Ty);
+  Value *HighAddr = Builder.CreateGEP(
+      Ty, LowAddr, ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
+  Builder.CreateAlignedStore(High, HighAddr, SI.getAlignment() / 2);
+
+  // Delete the old store; the bitwise instructions that fed it are now dead.
+  SI.eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -5243,6 +5364,8 @@
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
+      return true;
     stripInvariantGroupMetadata(*SI);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
Index: test/CodeGen/X86/split-store.ll
===================================================================
--- test/CodeGen/X86/split-store.ll
+++ test/CodeGen/X86/split-store.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
 
 ; CHECK-LABEL: int32_float_pair
-; CHECK: movss %xmm0, 4(%rsi)
 ; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
 define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -15,8 +15,8 @@
 }
 
 ; CHECK-LABEL: float_int32_pair
-; CHECK: movl %edi, 4(%rsi)
 ; CHECK: movss %xmm0, (%rsi)
+; CHECK: movl %edi, 4(%rsi)
 define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp1 to i32
@@ -29,9 +29,9 @@
 }
 
 ; CHECK-LABEL: int16_float_pair
-; CHECK: movss %xmm0, 4(%rsi)
 ; CHECK: movzwl %di, %eax
 ; CHECK: movl %eax, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
 define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -44,9 +44,9 @@
 }
 
 ; CHECK-LABEL: int8_float_pair
-; CHECK: movss %xmm0, 4(%rsi)
 ; CHECK: movzbl %dil, %eax
 ; CHECK: movl %eax, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
 define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -57,3 +57,19 @@
   store i64 %t4, i64* %ref.tmp, align 8
   ret void
 }
+
+; CHECK-LABEL: mbb_int32_float_pair
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+define void @mbb_int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  br label %next
+next:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
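
For reviewers, a minimal sketch (not part of the patch itself) of the IR that splitMergedValStore produces for the new mbb_int32_float_pair test, assuming the target's isMultiStoresCheaperThanBitsMerge hook reports the {i32, float} pair as profitable; the value names %f, %lo.addr, and %hi.addr are illustrative only:

next:
  ; the cross-BB bitcast is re-created in the store's BB so the DAG combiner
  ; can later fold it into the high-half store
  %f = bitcast float %tmp2 to i32
  %lo.addr = bitcast i64* %ref.tmp to i32*
  ; low half keeps the original store's alignment (8)
  store i32 %tmp1, i32* %lo.addr, align 8
  %hi.addr = getelementptr i32, i32* %lo.addr, i32 1
  ; high half sits HalfValBitSize further along, at half the alignment (4)
  store i32 %f, i32* %hi.addr, align 4
  ret void

The now-dead or/shl/zext chain is left behind for later cleanup, and the re-created bitcast is expected to fold into the movss checked by the CHECK lines above.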