Index: include/llvm/Analysis/ValueTracking.h =================================================================== --- include/llvm/Analysis/ValueTracking.h +++ include/llvm/Analysis/ValueTracking.h @@ -221,7 +221,8 @@ /// return the i8 value that it is represented with. This is true for all i8 /// values obviously, but is also true for i32 0, i32 -1, i16 0xF0F0, double /// 0.0 etc. If the value can't be handled with a repeated byte store (e.g. - /// i16 0x1234), return null. + /// i16 0x1234), return null. If the value is entirely undef and padding, + /// return undef. Value *isBytewiseValue(Value *V); /// Given an aggregrate and an sequence of indices, see if the scalar value Index: lib/Analysis/ValueTracking.cpp =================================================================== --- lib/Analysis/ValueTracking.cpp +++ lib/Analysis/ValueTracking.cpp @@ -3042,62 +3042,92 @@ return true; } -/// If the specified value can be set by repeating the same byte in memory, -/// return the i8 value that it is represented with. This is -/// true for all i8 values obviously, but is also true for i32 0, i32 -1, -/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated -/// byte store (e.g. i16 0x1234), return null. Value *llvm::isBytewiseValue(Value *V) { + // All byte-wide stores are splatable, even of arbitrary variables. - if (V->getType()->isIntegerTy(8)) return V; + if (V->getType()->isIntegerTy(8)) + return V; + + LLVMContext &Ctx = V->getContext(); + + // Undef don't care. + auto *UndefInt8 = UndefValue::get(Type::getInt8Ty(Ctx)); + if (isa<UndefValue>(V)) + return UndefInt8; + + Constant *C = dyn_cast<Constant>(V); + if (!C) { + // Conceptually, we could handle things like: + // %a = zext i8 %X to i16 + // %b = shl i16 %a, 8 + // %c = or i16 %a, %b + // but until there is an example that actually needs this, it doesn't seem + // worth worrying about. + return nullptr; + } // Handle 'null' ConstantArrayZero etc. 
- if (Constant *C = dyn_cast<Constant>(V)) - if (C->isNullValue()) - return Constant::getNullValue(Type::getInt8Ty(V->getContext())); + if (C->isNullValue()) + return Constant::getNullValue(Type::getInt8Ty(Ctx)); - // Constant float and double values can be handled as integer values if the + // Constant floating-point values can be handled as integer values if the // corresponding integer value is "byteable". An important case is 0.0. - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) { - if (CFP->getType()->isFloatTy()) - V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext())); - if (CFP->getType()->isDoubleTy()) - V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext())); + if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { + Type *Ty = nullptr; + if (CFP->getType()->isHalfTy()) + Ty = Type::getInt16Ty(Ctx); + else if (CFP->getType()->isFloatTy()) + Ty = Type::getInt32Ty(Ctx); + else if (CFP->getType()->isDoubleTy()) + Ty = Type::getInt64Ty(Ctx); // Don't handle long double formats, which have strange constraints. + return Ty ? isBytewiseValue(ConstantExpr::getBitCast(CFP, Ty)) : nullptr; } // We can handle constant integers that are multiple of 8 bits. - if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { if (CI->getBitWidth() % 8 == 0) { assert(CI->getBitWidth() > 8 && "8 bits should be handled above!"); - if (!CI->getValue().isSplat(8)) return nullptr; - return ConstantInt::get(V->getContext(), CI->getValue().trunc(8)); + return ConstantInt::get(Ctx, CI->getValue().trunc(8)); } } - // A ConstantDataArray/Vector is splatable if all its members are equal and - // also splatable. 
- if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(V)) { - Value *Elt = CA->getElementAsConstant(0); - Value *Val = isBytewiseValue(Elt); - if (!Val) + auto Merge = [&](Value *LHS, Value *RHS) -> Value * { + if (LHS == RHS) + return LHS; + if (!LHS || !RHS) return nullptr; + if (LHS == UndefInt8) + return RHS; + if (RHS == UndefInt8) + return LHS; + return nullptr; + }; - for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I) - if (CA->getElementAsConstant(I) != Elt) + if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(C)) { + Value *Val = UndefInt8; + for (unsigned I = 0, E = CA->getNumElements(); I != E; ++I) + if (!(Val = Merge(Val, isBytewiseValue(CA->getElementAsConstant(I))))) return nullptr; + return Val; + } + + if (isa<ConstantVector>(C)) { + Constant *Splat = cast<ConstantVector>(C)->getSplatValue(); + return Splat ? isBytewiseValue(Splat) : nullptr; + } + if (isa<ConstantArray>(C) || isa<ConstantStruct>(C)) { + Value *Val = UndefInt8; + for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) + if (!(Val = Merge(Val, isBytewiseValue(C->getOperand(I))))) + return nullptr; return Val; } - // Conceptually, we could handle things like: - // %a = zext i8 %X to i16 - // %b = shl i16 %a, 8 - // %c = or i16 %a, %b - // but until there is an example that actually needs this, it doesn't seem - // worth worrying about. + // Don't try to handle the handful of other constants. return nullptr; } Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -347,6 +347,9 @@ /// Note that we don't ever attempt to use memset_pattern8 or 4, because these /// just replicate their input array and then pass on to memset_pattern16. static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { + // FIXME: This could check for UndefValue because it can be merged into any + // other valid pattern. 
+ // If the value isn't a constant, we can't promote it to being in a constant // array. We could theoretically do a store to an alloca or something, but // that doesn't seem worthwhile. @@ -645,9 +648,13 @@ if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { if (ForMemset) { + if (isa<UndefValue>(FirstSplatValue)) + FirstSplatValue = SecondSplatValue; if (FirstSplatValue != SecondSplatValue) continue; } else { + if (isa<UndefValue>(FirstPatternValue)) + FirstPatternValue = SecondPatternValue; if (FirstPatternValue != SecondPatternValue) continue; } Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp =================================================================== --- lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -413,7 +413,10 @@ if (!NextStore->isSimple()) break; // Check to see if this stored value is of the same byte-splattable value. - if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) + Value *StoredByte = isBytewiseValue(NextStore->getOperand(0)); + if (isa<UndefValue>(ByteVal) && StoredByte) + ByteVal = StoredByte; + if (ByteVal != StoredByte) break; // Check to see if this store is to a constant offset from the start ptr. @@ -751,8 +754,8 @@ // 0xA0A0A0A0 and 0.0. auto *V = SI->getOperand(0); if (Value *ByteVal = isBytewiseValue(V)) { - if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), - ByteVal)) { + if (Instruction *I = + tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I->getIterator(); // Don't invalidate iterator. return true; } @@ -788,12 +791,13 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { // See if there is another memset or store neighboring this memset which // allows us to widen out the memset to do a single larger store. 
- if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) - if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), - MSI->getValue())) { + if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) { + Value *ByteVal = MSI->getValue(); + if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), ByteVal)) { BBI = I->getIterator(); // Don't invalidate iterator. return true; } + } return false; } Index: test/Transforms/MemCpyOpt/fca2memcpy.ll =================================================================== --- test/Transforms/MemCpyOpt/fca2memcpy.ll +++ test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -73,13 +73,16 @@ ret void } -; If the store address is computed ina complex manner, make +; If the store address is computed in a complex manner, make ; sure we lift the computation as well if needed and possible. define void @addrproducer(%S* %src, %S* %dst) { -; CHECK-LABEL: addrproducer -; CHECK: %dst2 = getelementptr %S, %S* %dst, i64 1 -; CHECK: call void @llvm.memmove.p0i8.p0i8.i64 -; CHECK-NEXT: store %S undef, %S* %dst +; CHECK-LABEL: addrproducer( +; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8* +; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i64 1 +; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8* +; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8* +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: ret void %1 = load %S, %S* %src store %S undef, %S* %dst @@ -89,7 +92,14 @@ } define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) { -; CHECK-LABEL: aliasaddrproducer +; CHECK-LABEL: aliasaddrproducer( +; CHECK-NEXT: %[[SRC:[0-9]+]] = load %S, %S* %src +; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false) +; CHECK-NEXT: %dstindex = 
load i32, i32* %dstidptr +; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex +; CHECK-NEXT: store %S %[[SRC]], %S* %dst2 +; CHECK-NEXT: ret void %1 = load %S, %S* %src store %S undef, %S* %dst %dstindex = load i32, i32* %dstidptr @@ -99,7 +109,16 @@ } define void @noaliasaddrproducer(%S* %src, %S* noalias %dst, i32* noalias %dstidptr) { -; CHECK-LABEL: noaliasaddrproducer +; CHECK-LABEL: noaliasaddrproducer( +; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8* +; CHECK-NEXT: %[[LOADED:[0-9]+]] = load i32, i32* %dstidptr +; CHECK-NEXT: %dstindex = or i32 %[[LOADED]], 1 +; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex +; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8* +; CHECK-NEXT: %[[SRCCAST2:[0-9]+]] = bitcast %S* %src to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST2]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SRCCAST]], i8 undef, i64 16, i1 false) +; CHECK-NEXT: ret void %1 = load %S, %S* %src store %S undef, %S* %src %2 = load i32, i32* %dstidptr