Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -124,6 +124,10 @@
     "profile-guided-section-prefix", cl::Hidden, cl::init(true),
     cl::desc("Use profile info to add section prefix for hot/cold functions"));
 
+static cl::opt<bool> ForceSplitStore(
+    "force-split-store", cl::Hidden, cl::init(false),
+    cl::desc("Force store splitting no matter what the target query says."));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
@@ -5263,6 +5267,150 @@
   return false;
 }
 
+/// For the instruction sequence of store below, F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+///   (store (or (zext (bitcast F to i32) to i64),
+///              (shl (zext I to i64), 32)), addr)  -->
+///   (store F, addr) and (store I, addr+4)
+///
+/// The addresses above are for a little-endian target; on a big-endian target
+/// the two halves trade places so that the bytes in memory do not change.
+///
+/// Similarly, splitting for other merged store can also be beneficial, like:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8},  i32 store --> two i16 stores.
+/// For pair of {i8, i8},   i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// The store patterns are commonly seen from the simple code snippet below
+/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
+///   void goo(const std::pair<int, float> &);
+///   hoo() {
+///     ...
+///     goo(std::make_pair(tmp, ftmp));
+///     ...
+///   }
+///
+/// Although we already have similar splitting in DAG Combine, we duplicate
+/// it in CodeGenPrepare to catch the case in which pattern is across
+/// multiple BBs. The logic in DAG Combine is kept to catch case generated
+/// during code expansion.
+///
+/// Only simple (non-volatile, non-atomic) integer stores are split, and only
+/// when the half-width integer type has no padding, so that the second half
+/// lands exactly HalfValBitSize / 8 bytes past the first.
+///
+/// \returns true if \p SI was replaced by two narrower stores and erased.
+static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
+                                const TargetLowering &TLI) {
+  // Splitting would turn one volatile access into two and would drop the
+  // ordering of an atomic store, so only simple stores are handled.
+  if (!SI.isSimple())
+    return false;
+
+  // Handle simple but common cases only: an integer without padding bits.
+  // Checking this before halving the width also keeps i1 and zero-sized types
+  // from reaching Type::getIntNTy with a bit width of 0.
+  Type *StoreType = SI.getValueOperand()->getType();
+  if (!StoreType->isIntegerTy() ||
+      DL.getTypeStoreSizeInBits(StoreType) != DL.getTypeSizeInBits(StoreType))
+    return false;
+
+  // One half is stored through a GEP to element 1 of SplitStoreType, which is
+  // HalfValBitSize / 8 bytes past the base only if the half type has no
+  // padding in memory (this rejects e.g. an i24 half with a 32-bit alloc size).
+  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
+  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
+  if (DL.getTypeAllocSizeInBits(SplitStoreType) != HalfValBitSize)
+    return false;
+
+  // Match the following patterns:
+  // (store (or (zext LValue to i64),
+  //            (shl (zext HValue to i64), 32)), addr)
+  //  or
+  // (store (or (shl (zext HValue to i64), 32),
+  //            (zext LValue to i64)), addr)
+  // where i64 stands for StoreType and 32 for HalfValBitSize.
+  // Expect both operands of OR and the first operand of SHL have only
+  // one use.
+  Value *LValue, *HValue;
+  if (!match(SI.getValueOperand(),
+             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
+                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
+                                   m_SpecificInt(HalfValBitSize))))))
+    return false;
+
+  // Check LValue and HValue are integers no wider than HalfValBitSize.
+  if (!LValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
+      !HValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
+    return false;
+
+  // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
+  // as the input of target query.
+  auto *LBC = dyn_cast<BitCastInst>(LValue);
+  auto *HBC = dyn_cast<BitCastInst>(HValue);
+  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
+                  : EVT::getEVT(LValue->getType());
+  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
+                   : EVT::getEVT(HValue->getType());
+  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
+    return false;
+
+  // Start to split store.
+  IRBuilder<> Builder(SI.getContext());
+  Builder.SetInsertPoint(&SI);
+
+  // If LValue/HValue is a bitcast in another BB, create a new one in current
+  // BB so it may be merged with the split stores by dag combiner.
+  if (LBC && LBC->getParent() != SI.getParent())
+    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
+  if (HBC && HBC->getParent() != SI.getParent())
+    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
+
+  // An alignment of 0 stands for the ABI alignment of the stored type.
+  unsigned BaseAlign = SI.getAlignment();
+  if (BaseAlign == 0)
+    BaseAlign = DL.getABITypeAlignment(StoreType);
+  // The half stored HalfValBitSize / 8 bytes past the base is only aligned to
+  // the largest power of two dividing both the base alignment and that offset,
+  // i.e. the lowest set bit of their bitwise OR.
+  unsigned AlignOrOffset = BaseAlign | (HalfValBitSize / 8);
+  unsigned OffsetAlign = AlignOrOffset & (~AlignOrOffset + 1);
+  // The low half sits at the base address on little-endian targets and past it
+  // on big-endian targets.
+  bool IsLE = DL.isLittleEndian();
+
+  auto CreateSplitStore = [&](Value *V, bool Upper) {
+    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
+    Value *Addr = Builder.CreateBitCast(
+        SI.getOperand(1),
+        SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
+    bool IsOffsetStore = (Upper == IsLE);
+    if (IsOffsetStore)
+      Addr = Builder.CreateGEP(
+          SplitStoreType, Addr,
+          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
+    Builder.CreateAlignedStore(V, Addr,
+                               IsOffsetStore ? OffsetAlign : BaseAlign);
+  };
+
+  CreateSplitStore(LValue, false);
+  CreateSplitStore(HValue, true);
+
+  // Delete the old store.
+  SI.eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -5327,6 +5475,8 @@
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
+      return true;
     stripInvariantGroupMetadata(*SI);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
Index: test/CodeGen/X86/split-store.ll
===================================================================
--- test/CodeGen/X86/split-store.ll
+++ test/CodeGen/X86/split-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -force-split-store < %s | FileCheck %s
 
 ; CHECK-LABEL: int32_float_pair
 ; CHECK: movl %edi, (%rsi)
@@ -57,3 +57,210 @@
   store i64 %t4, i64* %ref.tmp, align 8
   ret void
 }
+
+; CHECK-LABEL: int32_int32_pair
+; CHECK: movl %edi, (%rdx)
+; CHECK: movl %esi, 4(%rdx)
+define void @int32_int32_pair(i32 %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i32 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int16_int16_pair
+; CHECK: movw %di, (%rdx)
+; CHECK: movw %si, 2(%rdx)
+define void @int16_int16_pair(i16 signext %tmp1, i16 signext %tmp2, i32* %ref.tmp) {
+entry:
+  %t1 = zext i16 %tmp2 to i32
+  %t2 = shl nuw i32 %t1, 16
+  %t3 = zext i16 %tmp1 to i32
+  %t4 = or i32 %t2, %t3
+  store i32 %t4, i32* %ref.tmp, align 4
+  ret void
+}
+
+; CHECK-LABEL: int8_int8_pair
+; CHECK: movb %dil, (%rdx)
+; CHECK: movb %sil, 1(%rdx)
+define void @int8_int8_pair(i8 signext %tmp1, i8 signext %tmp2, i16* %ref.tmp) {
+entry:
+  %t1 = zext i8 %tmp2 to i16
+  %t2 = shl nuw i16 %t1, 8
+  %t3 = zext i8 %tmp1 to i16
+  %t4 = or i16 %t2, %t3
+  store i16 %t4, i16* %ref.tmp, align 2
+  ret void
+}
+
+; CHECK-LABEL: int31_int31_pair
+; CHECK: andl $2147483647, %edi
+; CHECK: movl %edi, (%rdx)
+; CHECK: andl $2147483647, %esi
+; CHECK: movl %esi, 4(%rdx)
+define void @int31_int31_pair(i31 %tmp1, i31 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i31 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i31 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int31_int17_pair
+; CHECK: andl $2147483647, %edi
+; CHECK: movl %edi, (%rdx)
+; CHECK: andl $131071, %esi
+; CHECK: movl %esi, 4(%rdx)
+define void @int31_int17_pair(i31 %tmp1, i17 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i17 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i31 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int7_int3_pair
+; CHECK: andb $127, %dil
+; CHECK: movb %dil, (%rdx)
+; CHECK: andb $7, %sil
+; CHECK: movb %sil, 1(%rdx)
+define void @int7_int3_pair(i7 signext %tmp1, i3 signext %tmp2, i16* %ref.tmp) {
+entry:
+  %t1 = zext i3 %tmp2 to i16
+  %t2 = shl nuw i16 %t1, 8
+  %t3 = zext i7 %tmp1 to i16
+  %t4 = or i16 %t2, %t3
+  store i16 %t4, i16* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeAllocSizeInBits(i24) != getTypeSizeInBits(i24): a split would put the
+; high half at byte offset 4 instead of 3, so store split doesn't kick in.
+; CHECK-LABEL: int24_int24_pair
+; CHECK-DAG: movl {{.*}}, (%rdx)
+; CHECK-DAG: movw {{.*}}, 4(%rdx)
+define void @int24_int24_pair(i24 signext %tmp1, i24 signext %tmp2, i48* %ref.tmp) {
+entry:
+  %t1 = zext i24 %tmp2 to i48
+  %t2 = shl nuw i48 %t1, 24
+  %t3 = zext i24 %tmp1 to i48
+  %t4 = or i48 %t2, %t3
+  store i48 %t4, i48* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeAllocSizeInBits(i12) != getTypeSizeInBits(i12), so store split doesn't kick in.
+; CHECK-LABEL: int12_int12_pair
+; CHECK: movl %esi, %eax
+; CHECK: shll $12, %eax
+; CHECK: andl $4095, %edi
+; CHECK: orl %eax, %edi
+; CHECK: shrl $4, %esi
+; CHECK: movb %sil, 2(%rdx)
+; CHECK: movw %di, (%rdx)
+define void @int12_int12_pair(i12 signext %tmp1, i12 signext %tmp2, i24* %ref.tmp) {
+entry:
+  %t1 = zext i12 %tmp2 to i24
+  %t2 = shl nuw i24 %t1, 12
+  %t3 = zext i12 %tmp1 to i24
+  %t4 = or i24 %t2, %t3
+  store i24 %t4, i24* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeSizeInBits(i14) != getTypeStoreSizeInBits(i14), so store split doesn't kick in.
+; CHECK-LABEL: int7_int7_pair
+; CHECK: movzbl %sil, %eax
+; CHECK: shll $7, %eax
+; CHECK: andb $127, %dil
+; CHECK: movzbl %dil, %ecx
+; CHECK: orl %eax, %ecx
+; CHECK: andl $16383, %ecx
+; CHECK: movw %cx, (%rdx)
+define void @int7_int7_pair(i7 signext %tmp1, i7 signext %tmp2, i14* %ref.tmp) {
+entry:
+  %t1 = zext i7 %tmp2 to i14
+  %t2 = shl nuw i14 %t1, 7
+  %t3 = zext i7 %tmp1 to i14
+  %t4 = or i14 %t2, %t3
+  store i14 %t4, i14* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeSizeInBits(i2) != getTypeStoreSizeInBits(i2), so store split doesn't kick in.
+; CHECK-LABEL: int1_int1_pair
+; CHECK: addb %sil, %sil
+; CHECK: andb $1, %dil
+; CHECK: orb %sil, %dil
+; CHECK: andb $3, %dil
+; CHECK: movb %dil, (%rdx)
+define void @int1_int1_pair(i1 signext %tmp1, i1 signext %tmp2, i2* %ref.tmp) {
+entry:
+  %t1 = zext i1 %tmp2 to i2
+  %t2 = shl nuw i2 %t1, 1
+  %t3 = zext i1 %tmp1 to i2
+  %t4 = or i2 %t2, %t3
+  store i2 %t4, i2* %ref.tmp, align 1
+  ret void
+}
+
+; CHECK-LABEL: mbb_int32_float_pair
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+define void @mbb_int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  br label %next
+next:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: mbb_int32_float_multi_stores
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: # %bb2
+; CHECK: movl %edi, (%rdx)
+; CHECK: movss %xmm0, 4(%rdx)
+define void @mbb_int32_float_multi_stores(i32 %tmp1, float %tmp2, i64* %ref.tmp, i64* %ref.tmp1, i1 %cmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  br label %bb1
+bb1:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  br i1 %cmp, label %bb2, label %exitbb
+bb2:
+  store i64 %t4, i64* %ref.tmp1, align 8
+  br label %exitbb
+exitbb:
+  ret void
+}
+
+; Volatile stores must remain a single access, so store split doesn't kick in.
+; CHECK-LABEL: volatile_int32_int32_pair
+; CHECK: movq {{.*}}, (%rdx)
+define void @volatile_int32_int32_pair(i32 %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i32 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store volatile i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}