Index: llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -17,13 +17,13 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/CaptureTracking.h" -#include "llvm/Analysis/Loads.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -36,6 +36,7 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SizeOpts.h" using namespace llvm; @@ -1169,17 +1170,77 @@ /// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n). Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) { + if (isa(Memset)) { + bool canBeFold = false; + do { + if (auto *MemSetInstr = dyn_cast_or_null(Memset)) { + if (MemSetInstr->getVolatileCst() == + ConstantInt::getTrue(Memset->getContext())) + break; + + if (MemSetInstr->getRawDest()->getType()->getPointerAddressSpace() != 0) + break; + + canBeFold = true; + } + } while (0); + + if (!canBeFold) + return nullptr; + } + // This has to be a memset of zeros (bzero). 
auto *FillValue = dyn_cast(Memset->getArgOperand(1)); if (!FillValue || FillValue->getZExtValue() != 0) return nullptr; + auto CanFoldMallocFn = [](CallInst *Malloc, CallInst *Memset) { + if (Malloc->hasOneUse()) + return true; + + for (Use &U : Malloc->uses()) { + User *I = U.getUser(); + + // skip memset function itself + if (I == Memset) + continue; + + // considering the following ir: + // + // %call = call noalias i8* @malloc(i64 %conv) #4 + // %cmp = icmp eq i8* %call, null + // + // when the call malloc to allocate virtual memory, + // it might have null-check + if (CmpInst *Cmp = dyn_cast_or_null(I)) { + uint64_t index = U.getOperandNo() == 0 ? 1 : 0; + auto *Ops = Cmp->getOperand(index); + auto CurPredicate = Cmp->getPredicate(); + if ((CurPredicate != ICmpInst::ICMP_EQ && + CurPredicate != ICmpInst::ICMP_NE) || + Ops != Constant::getNullValue(Ops->getType())) + return false; + continue; + } + + // if the code satisfied the following ir: + // + // %call = call noalias i8* @malloc(i64 %conv) #4 + // %1 = bitcast i8* %call to float* + // + // malloc's return value is void *, so it might have a bitcast + if (!isa(I)) + return false; + } + return true; + }; + // TODO: We should handle the case where the malloc has more than one use. // This is necessary to optimize common patterns such as when the result of // the malloc is checked against null or when a memset intrinsic is used in // place of a memset library call. auto *Malloc = dyn_cast(Memset->getArgOperand(0)); - if (!Malloc || !Malloc->hasOneUse()) + if (!Malloc || !CanFoldMallocFn(Malloc, Memset)) return nullptr; // Is the inner call really malloc()? 
@@ -1214,12 +1275,13 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); annotateNonNullAndDereferenceable(CI, 0, Size, DL); - if (isa(CI)) - return nullptr; if (auto *Calloc = foldMallocMemset(CI, B)) return Calloc; + if (isa(CI)) + return nullptr; + // memset(p, v, n) -> llvm.memset(align 1 p, v, n) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1)); Index: llvm/test/Transforms/InstCombine/memset-1.ll =================================================================== --- llvm/test/Transforms/InstCombine/memset-1.ll +++ llvm/test/Transforms/InstCombine/memset-1.ll @@ -63,12 +63,11 @@ define float* @pr25892(i32 %size) #0 { ; CHECK-LABEL: @pr25892( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i8* @malloc(i32 [[SIZE:%.*]]) #0 +; CHECK-NEXT: [[CALL:%.*]] = call i8* @calloc(i32 1, i32 [[SIZE:%.*]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[CALL]], null ; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[BC:%.*]] = bitcast i8* [[CALL]] to float* -; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* nonnull align 1 [[CALL]], i8 0, i32 [[SIZE]], i1 false) #0 ; CHECK-NEXT: br label [[CLEANUP]] ; CHECK: cleanup: ; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float* [ [[BC]], [[IF_END]] ], [ null, [[ENTRY:%.*]] ] @@ -202,4 +201,3 @@ attributes #0 = { nounwind ssp uwtable } attributes #1 = { nounwind } attributes #2 = { nounwind readnone } - Index: llvm/test/Transforms/InstCombine/promot_malloc_and_memset_intrinsic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InstCombine/promot_malloc_and_memset_intrinsic.ll @@ -0,0 +1,70 @@ +; RUN: opt < %s -O2 -S | FileCheck %s +; RUN: opt < %s -O3 -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" 
+target triple = "x86_64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } + +@stderr = external dso_local global %struct._IO_FILE*, align 8 +@.str = private unnamed_addr constant [28 x i8] c"Can't allocate data array.\0A\00", align 1 + +; check the case1: +; 1. malloc & memset +; 2. null-check +; 3. bitcast +; Function Attrs: noinline nounwind uwtable +define dso_local float* @foo(i32 %size) #0 { +entry: +; CHECK: %calloc = tail call i8* @calloc(i64 1, i64 %conv) +; CHECK-NEXT: %cmp = icmp eq i8* %calloc, null + + %size.addr = alloca i32, align 4 + %fdata = alloca float*, align 8 + store i32 %size, i32* %size.addr, align 4 + %0 = load i32, i32* %size.addr, align 4 + %conv = sext i32 %0 to i64 + %call = call noalias i8* @malloc(i64 %conv) #5 + %1 = bitcast i8* %call to float* + store float* %1, float** %fdata, align 8 + %2 = load float*, float** %fdata, align 8 + %cmp = icmp eq float* %2, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %3 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %3, i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str, i64 0, i64 0)) + call void @exit(i32 1) #6 + unreachable + +if.end: ; preds = %entry +; CHECK: %2 = bitcast i8* %calloc to float* +; CHECK-NEXT: ret float* %2 + %4 = load float*, float** %fdata, align 8 + %5 = bitcast float* %4 to i8* + %6 = load i32, i32* %size.addr, align 4 + %conv3 = sext i32 %6 to i64 + call void @llvm.memset.p0i8.i64(i8* align 4 %5, i8 0, i64 %conv3, i1 false) + %7 = load float*, float** %fdata, align 8 + ret float* %7 +} + +; Function Attrs: nounwind +declare dso_local noalias i8* @malloc(i64) #1 + +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #2 + +; Function Attrs: noreturn nounwind +declare dso_local void @exit(i32) #3 + +; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #4 + +attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noreturn nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { argmemonly nofree nosync nounwind willreturn writeonly } +attributes #5 = { nounwind } +attributes #6 = { noreturn nounwind } \ No newline at end of file