Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -595,6 +595,7 @@
                          bool Inside);
   Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
   Instruction *MatchBSwap(BinaryOperator &I);
+  bool SplitInt64Store(StoreInst &SI);
   bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
   Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
   Instruction *SimplifyMemSet(MemSetInst *MI);
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1265,9 +1265,140 @@
       if (SimplifyStoreAtEndOfBlock(SI))
         return nullptr;  // xform done!
 
+  if (SplitInt64Store(SI))
+    return nullptr;  // xform done!
+
   return nullptr;
 }
 
+/// If \p V is a single-use 'shl X, 32', return that shift instruction;
+/// otherwise return null.
+static BinaryOperator *matchOneUseShlBy32(Value *V) {
+  BinaryOperator *SHL = dyn_cast<BinaryOperator>(V);
+  if (!SHL || SHL->getOpcode() != Instruction::Shl || !SHL->hasOneUse())
+    return nullptr;
+  ConstantInt *CI = dyn_cast<ConstantInt>(SHL->getOperand(1));
+  if (!CI || CI->getLimitedValue() != 32)
+    return nullptr;
+  return SHL;
+}
+
+/// If \p V is a single-use zext of an integer that is at most 32 bits wide,
+/// return that zext; otherwise return null.
+static ZExtInst *matchOneUseZExtOfHalf(Value *V) {
+  ZExtInst *Ext = dyn_cast<ZExtInst>(V);
+  if (!Ext || !Ext->hasOneUse())
+    return nullptr;
+  Type *SrcTy = Ext->getOperand(0)->getType();
+  if (!SrcTy->isIntegerTy() || SrcTy->getIntegerBitWidth() > 32)
+    return nullptr;
+  return Ext;
+}
+
+/// For the int64 store sequence below, %int_tmp and %float_tmp are bundled
+/// together into one i64 value before being stored to memory. If that i64
+/// value is not used outside of the store, it is more efficient to generate
+/// separate stores for %int_tmp and %float_tmp.
+///
+/// Instruction sequence of int64 store:
+///   %ref.tmp = alloca i64, align 8
+///   %1 = bitcast float %float_tmp to i32
+///   %sroa.1.ext = zext i32 %1 to i64
+///   %sroa.1.shift = shl nuw i64 %sroa.1.ext, 32
+///   %sroa.0.ext = zext i32 %int_tmp to i64
+///   %sroa.0.insert = or i64 %sroa.1.shift, %sroa.0.ext
+///   store i64 %sroa.0.insert, i64* %ref.tmp, align 8
+///
+/// Instruction sequence of split stores (little-endian layout; on a
+/// big-endian target the two halves swap addresses):
+///   %ref.tmp = alloca i64, align 8
+///   %1 = bitcast i64* %ref.tmp to i32*
+///   store i32 %int_tmp, i32* %1, align 8
+///   %2 = getelementptr i32, i32* %1, i64 1
+///   %3 = bitcast i32* %2 to float*
+///   store float %float_tmp, float* %3, align 4
+/// (This function itself emits two i32 stores; the float store above is the
+/// form expected once the rest of InstCombine has processed the result.)
+///
+/// The int64 store pattern is commonly seen from the simple code snippet below
+/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
+///   void goo(const std::pair<int, float> &);
+///   hoo() {
+///     ...
+///     goo(std::make_pair(tmp, ftmp));
+///     ...
+///   }
+///
+/// Volatile and atomic stores are left alone: they must remain a single
+/// memory access. The 'or' is matched with its operands in either order.
+///
+/// \returns true if \p SI was replaced by two stores and erased, false if the
+/// IR was left untouched.
+bool InstCombiner::SplitInt64Store(StoreInst &SI) {
+  // Splitting a volatile or atomic store would turn one access into two.
+  if (!SI.isSimple())
+    return false;
+
+  Value *Val = SI.getValueOperand();
+  if (!Val->getType()->isIntegerTy(64))
+    return false;
+
+  // The merged value must feed only this store; otherwise it still has to be
+  // computed and the split only adds instructions.
+  BinaryOperator *OR = dyn_cast<BinaryOperator>(Val);
+  if (!OR || OR->getOpcode() != Instruction::Or || !OR->hasOneUse())
+    return false;
+
+  // 'or' is commutative, so the shifted high half may be either operand.
+  Value *LowVal = OR->getOperand(1);
+  BinaryOperator *SHL = matchOneUseShlBy32(OR->getOperand(0));
+  if (!SHL) {
+    LowVal = OR->getOperand(0);
+    SHL = matchOneUseShlBy32(OR->getOperand(1));
+  }
+  if (!SHL)
+    return false;
+
+  // Both halves must be single-use zero-extensions of values that fit into
+  // i32, so the i32 stores below write exactly the bits the i64 store did.
+  ZExtInst *HighExt = matchOneUseZExtOfHalf(SHL->getOperand(0));
+  ZExtInst *LowExt = matchOneUseZExtOfHalf(LowVal);
+  if (!HighExt || !LowExt)
+    return false;
+
+  // An alignment of 0 stands for the ABI alignment of the stored type. The
+  // half at byte offset 4 is only as aligned as min(store alignment, 4).
+  unsigned BaseAlign = SI.getAlignment();
+  if (BaseAlign == 0)
+    BaseAlign = DL.getABITypeAlignment(Val->getType());
+  unsigned OffsetAlign = BaseAlign < 4 ? BaseAlign : 4;
+
+  // Keep the pointer's address space, and place the low half at the lower
+  // address only on little-endian targets.
+  bool IsLE = DL.isLittleEndian();
+  Type *Int32Ty = Builder->getInt32Ty();
+  Value *Base = Builder->CreateBitCast(
+      SI.getPointerOperand(),
+      Int32Ty->getPointerTo(SI.getPointerAddressSpace()));
+
+  // Now it is ok to split the int64 store into two int32 stores.
+  Value *Low = Builder->CreateZExtOrBitCast(LowExt->getOperand(0), Int32Ty);
+  Value *LowAddr = IsLE ? Base : Builder->CreateConstGEP1_32(Base, 1);
+  Builder->CreateAlignedStore(Low, LowAddr, IsLE ? BaseAlign : OffsetAlign);
+  Value *High = Builder->CreateZExtOrBitCast(HighExt->getOperand(0), Int32Ty);
+  Value *HighAddr = IsLE ? Builder->CreateConstGEP1_32(Base, 1) : Base;
+  Builder->CreateAlignedStore(High, HighAddr, IsLE ? OffsetAlign : BaseAlign);
+
+  // Delete the old store and the bitwise instructions generating int64.
+  // Users are erased before their operands so nothing erased is still used.
+  eraseInstFromFunction(SI);
+  eraseInstFromFunction(*OR);
+  eraseInstFromFunction(*SHL);
+  eraseInstFromFunction(*HighExt);
+  eraseInstFromFunction(*LowExt);
+  return true;
+}
+
 /// SimplifyStoreAtEndOfBlock - Turn things like:
 ///   if () { *P = v1; } else { *P = v2 }
 /// into a phi node with a store in the successor.
Index: test/Transforms/InstCombine/split-store.ll
===================================================================
--- test/Transforms/InstCombine/split-store.ll
+++ test/Transforms/InstCombine/split-store.ll
@@ -0,0 +1,144 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+;
+; Verify that an i64 store whose value only packs two values of at most 32
+; bits each (zext/shl/or) is split into two narrower stores.
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+; Low half is an i32, high half is a float that was bitcast to i32.
+declare void @goo1(%"pair1"* dereferenceable(8)) local_unnamed_addr
+%"pair1" = type { i32, float }
+
+; CHECK-LABEL: @int32_float_pair(
+; CHECK: store i32 %tmp1
+; CHECK: store float %tmp2
+define void @int32_float_pair(i32 %tmp1, float %tmp2) local_unnamed_addr {
+entry:
+  %ref.tmp = alloca i64, align 8
+  %tmpcast = bitcast i64* %ref.tmp to %"pair1"*
+  %t0 = bitcast i64* %ref.tmp to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %t0)
+  %t1 = bitcast float %tmp2 to i32
+  %retval.sroa.2.0.insert.ext.i = zext i32 %t1 to i64
+  %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32
+  %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64
+  %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i
+  store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8
+  call void @goo1(%"pair1"* dereferenceable(8) %tmpcast)
+  call void @llvm.lifetime.end(i64 8, i8* %t0)
+  ret void
+}
+
+; Low half is a float that was bitcast to i32, high half is an i32.
+declare void @goo2(%"pair2"* dereferenceable(8)) local_unnamed_addr
+%"pair2" = type { float, i32 }
+
+; CHECK-LABEL: @float_int32_pair(
+; CHECK: store float %tmp1
+; CHECK: store i32 %tmp2
+define void @float_int32_pair(float %tmp1, i32 %tmp2) local_unnamed_addr {
+entry:
+  %ref.tmp = alloca i64, align 8
+  %tmpcast = bitcast i64* %ref.tmp to %"pair2"*
+  %t0 = bitcast i64* %ref.tmp to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %t0)
+  %t1 = bitcast float %tmp1 to i32
+  %retval.sroa.2.0.insert.ext.i = zext i32 %tmp2 to i64
+  %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32
+  %retval.sroa.0.0.insert.ext.i = zext i32 %t1 to i64
+  %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i
+  store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8
+  call void @goo2(%"pair2"* dereferenceable(8) %tmpcast)
+  call void @llvm.lifetime.end(i64 8, i8* %t0)
+  ret void
+}
+
+; Both halves are i32.
+declare void @goo3(%"pair3"* dereferenceable(8)) local_unnamed_addr
+%"pair3" = type { i32, i32 }
+
+; CHECK-LABEL: @int32_int32_pair(
+; CHECK: store i32 %tmp1
+; CHECK: store i32 %tmp2
+define void @int32_int32_pair(i32 %tmp1, i32 %tmp2) local_unnamed_addr {
+entry:
+  %ref.tmp = alloca i64, align 8
+  %tmpcast = bitcast i64* %ref.tmp to %"pair3"*
+  %t0 = bitcast i64* %ref.tmp to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %t0)
+  %retval.sroa.2.0.insert.ext.i = zext i32 %tmp2 to i64
+  %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32
+  %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64
+  %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i
+  store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8
+  call void @goo3(%"pair3"* dereferenceable(8) %tmpcast)
+  call void @llvm.lifetime.end(i64 8, i8* %t0)
+  ret void
+}
+
+; High half is an i16; it is zero-extended to i32 before being stored.
+declare void @goo4(%"pair4"* dereferenceable(8)) local_unnamed_addr
+%"pair4" = type { i32, i16 }
+
+; CHECK-LABEL: @int32_int16_pair(
+; CHECK: store i32 %tmp1
+; CHECK: %[[EXT:.+]] = zext i16 %tmp2 to i32
+; CHECK: store i32 %[[EXT]]
+define void @int32_int16_pair(i32 %tmp1, i16 signext %tmp2) local_unnamed_addr {
+entry:
+  %ref.tmp = alloca i64, align 8
+  %tmpcast = bitcast i64* %ref.tmp to %"pair4"*
+  %t0 = bitcast i64* %ref.tmp to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %t0)
+  %retval.sroa.2.0.insert.ext.i = zext i16 %tmp2 to i64
+  %retval.sroa.2.0.insert.shift.i = shl nuw nsw i64 %retval.sroa.2.0.insert.ext.i, 32
+  %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64
+  %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i
+  store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8
+  call void @goo4(%"pair4"* dereferenceable(8) %tmpcast)
+  call void @llvm.lifetime.end(i64 8, i8* %t0)
+  ret void
+}
+
+; High half is an i8; it is zero-extended to i32 before being stored.
+declare void @goo5(%"pair5"* dereferenceable(8)) local_unnamed_addr
+%"pair5" = type { i32, i8 }
+
+; CHECK-LABEL: @int32_int8_pair(
+; CHECK: store i32 %tmp1
+; CHECK: %[[EXT:.+]] = zext i8 %tmp2 to i32
+; CHECK: store i32 %[[EXT]]
+define void @int32_int8_pair(i32 %tmp1, i8 signext %tmp2) local_unnamed_addr {
+entry:
+  %ref.tmp = alloca i64, align 8
+  %tmpcast = bitcast i64* %ref.tmp to %"pair5"*
+  %t0 = bitcast i64* %ref.tmp to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %t0)
+  %retval.sroa.2.0.insert.ext.i = zext i8 %tmp2 to i64
+  %retval.sroa.2.0.insert.shift.i = shl nuw nsw i64 %retval.sroa.2.0.insert.ext.i, 32
+  %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64
+  %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i
+  store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8
+  call void @goo5(%"pair5"* dereferenceable(8) %tmpcast)
+  call void @llvm.lifetime.end(i64 8, i8* %t0)
+  ret void
+}
+
+; Negative test: the merged i64 value has a second use (it is returned), so
+; it has to be computed anyway and the store must stay a single i64 store.
+; CHECK-LABEL: @int64_multi_use(
+; CHECK: store i64
+; CHECK-NOT: store i32
+define i64 @int64_multi_use(i32 %tmp1, i32 %tmp2) {
+entry:
+  %ref.tmp = alloca i64, align 8
+  %tmpcast = bitcast i64* %ref.tmp to %"pair3"*
+  %hi.ext = zext i32 %tmp2 to i64
+  %hi.shl = shl nuw i64 %hi.ext, 32
+  %lo.ext = zext i32 %tmp1 to i64
+  %merged = or i64 %hi.shl, %lo.ext
+  store i64 %merged, i64* %ref.tmp, align 8
+  call void @goo3(%"pair3"* dereferenceable(8) %tmpcast)
+  ret i64 %merged
+}