diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -120,6 +120,11 @@ bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI); + /// Return true if, in the def-use chain of the given instruction, the + /// only users are integer compares with 0 or instructions that do not + /// change its value (e.g. ext instructions or combined compares). + bool isIterativelyOnlyUsedInZeroComparison(const Instruction *CxtI); + /// Return true if the given value is known to be non-zero when defined. For /// vectors, return true if every element is known to be non-zero when /// defined. For pointers, if the context instruction and dominator tree are diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -570,6 +570,7 @@ [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, NoCapture>, WriteOnly>, ImmArg>]>; +def int_strlen16 : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrArgMemOnly]>; // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -300,6 +300,52 @@ return true; } +bool llvm::isIterativelyOnlyUsedInZeroComparison(const Instruction *CxtI) { + SmallVector UserStack; + SmallPtrSet Visited; + UserStack.push_back(CxtI); + + while (!UserStack.empty()) { + auto *Curr = UserStack.back(); + UserStack.pop_back(); + // Cycles not allowed + if (!Visited.insert(Curr).second) + return false; + bool ValidUser = false; + if (Curr == CxtI || // Skip the root + isa(Curr) || // Iterate through phi nodes + Curr->getOpcode() == Instruction::SExt || + Curr->getOpcode() == Instruction::ZExt) + ValidUser = true; + + // 'or' instructions indicate a combined compare: + // e.g. if(strlen(a) || strlen(b)) + auto *BO = dyn_cast(Curr); + if (BO && BO->getOpcode() == BinaryOperator::Or) + ValidUser = true; + + // We are detecting if all values originating from + // CxtI sink to an icmp eq with 0. If this instruction is found, + // the remaining items in the stack must be checked. + auto *IC = dyn_cast(Curr); + if (IC && IC->isEquality()) + if (auto *C = dyn_cast(IC->getOperand(1))) + if (C->isZeroValue() || C->isNullValue()) + continue; + + if (!ValidUser) + return false; + + for (auto *U : Curr->users()) { + auto *UserInst = dyn_cast(U); + if (!UserInst) + return false; + UserStack.push_back(UserInst); + } + } + return true; +} + static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -249,6 +249,9 @@ "cgp-optimize-phi-types", cl::Hidden, cl::init(false), cl::desc("Enable converting phi types in CodeGenPrepare")); +static void expandStrlen16(Instruction *MemoryInst, Value *Addr, + Type *AccessTy); + namespace { enum ExtType { @@ -2011,6 +2014,16 @@ if (II) { switch (II->getIntrinsicID()) { default: break; + case Intrinsic::strlen16: { + assert(CI->getNumArgOperands() == 1 + && "the strlen16 intrinsic is expected to have only one argument."); + Value *Arg = CI->getArgOperand(0); + assert(Arg && Arg->getType()->isPointerTy() + && "Argument to the strlen16 intrinsic must be a pointer."); + expandStrlen16(CI, Arg, Arg->getType()); + ModifiedDT = true; + return true; + } case Intrinsic::assume: { II->eraseFromParent(); return true; @@ -4849,6 +4862,56 @@ return true; } +/// Expand the strlen16 intrinsic call to its loop equivalent +static void expandStrlen16(Instruction *Strlen16Inst, Value *StringArg, + Type *AccessTy) { + BasicBlock *OrigBB = Strlen16Inst->getParent(); + Function *F = OrigBB->getParent(); + BasicBlock *ResultBB = OrigBB->splitBasicBlock(Strlen16Inst, "result"); + BasicBlock *LoopExitBB = + BasicBlock::Create(F->getContext(), "loopexit", F, ResultBB); + BasicBlock *LoopBB = + BasicBlock::Create(F->getContext(), "loopbody", F, LoopExitBB); + + // The loop body has a gep incrementing by 1, each char loaded is + // null checked. If not null, get the next char. + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopPhi = LoopBuilder.CreatePHI(AccessTy, 0, "s16exp.phi"); + LoopPhi->addIncoming(StringArg, OrigBB); + Value *PTmp = LoopBuilder.CreateInBoundsGEP( + LoopPhi, ConstantInt::get(LoopBuilder.getInt64Ty(), 1), "s16exp.pTmp"); + Value *Element = LoopBuilder.CreateLoad(PTmp, "s16exp.ele"); + LoopBuilder.CreateCondBr( + LoopBuilder.CreateICmpEQ(Element, + ConstantInt::get(LoopBuilder.getInt16Ty(), 0)), + LoopExitBB, LoopBB); + LoopPhi->addIncoming(PTmp, LoopBB); + + OrigBB->getTerminator()->eraseFromParent(); + IRBuilder<> OrigBuilder(OrigBB); + OrigBuilder.CreateBr(LoopBB); + + // The loop exit computes a pointer difference between the + // last loaded char and the string base + IRBuilder<> LoopExitBuilder(LoopExitBB); + PHINode *LoopExitPhi = + LoopExitBuilder.CreatePHI(AccessTy, 0, "s16exp.exit.phi"); + LoopExitPhi->addIncoming(PTmp, LoopBB); + Value *EndPtr = LoopExitBuilder.CreatePtrToInt( + LoopExitPhi, LoopExitBuilder.getInt64Ty(), "s16.endptr"); + Value *StartPtr = LoopExitBuilder.CreatePtrToInt( + StringArg, LoopExitBuilder.getInt64Ty(), "s16.startptr"); + Value *Sub = LoopExitBuilder.CreateSub(EndPtr, StartPtr, "s16.diff"); + Value *Strlen16Len = LoopExitBuilder.CreateLShr( + Sub, ConstantInt::get(LoopExitBuilder.getInt64Ty(), 1), "s16.length"); + Value *Result = + LoopExitBuilder.CreateTrunc(Strlen16Len, Strlen16Inst->getType()); + LoopExitBuilder.CreateBr(ResultBB); + + Strlen16Inst->replaceAllUsesWith(Result); + Strlen16Inst->eraseFromParent(); +} + /// Return true if the specified values are defined in a /// different basic block than BB. static bool IsNonLocalValue(Value *V, BasicBlock *BB) { diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -79,6 +79,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -101,6 +102,7 @@ #include using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-idiom" @@ -113,6 +115,11 @@ "with -Os/-Oz"), cl::init(true), cl::Hidden); +static cl::opt DisableStrlenIdiom( + "disable-strlen-idiom", + cl::desc("Disable strlen loop idiom recognize and transform"), + cl::init(false), cl::Hidden); + namespace { class LoopIdiomRecognize { @@ -203,6 +210,7 @@ Value *Var, Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); + bool recognizeAndInsertStrlen(); /// @} }; @@ -1203,7 +1211,8 @@ << "] Noncountable Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizePopcount() || recognizeAndInsertFFS(); + return recognizePopcount() || recognizeAndInsertFFS() || + recognizeAndInsertStrlen(); } /// Check if the given conditional branch is based on the comparison between @@ -1575,6 +1584,243 @@ return true; } +/// Helper function that creates strlen16 intrinsic function calls. +static CallInst *createStrlen16Intrinsic(IRBuilder<> &IRBuilder, Value *Val) { + Value *Ops[] = {Val}; + Type *Tys[] = {Val->getType()}; + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Function *Func = Intrinsic::getDeclaration(M, Intrinsic::strlen16, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + return CI; +} + +/// Recognizes a strlen idiom by checking for loops that either keep +/// a variable counter or increment a char pointer and then subtract +/// with the base pointer. +/// +/// If detected, transforms the relevant code to a strlen function +/// call, and returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizeAndInsertStrlen() { + if (DisableStrlenIdiom) + return false; + + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) return false; + + BasicBlock *LoopBody = *(CurLoop->block_begin()); + // The strlen loop idiom can only have a maximum of 7 instructions: + // this arises when there is an increment variable in the loop. + const unsigned MaxLoopSize = 7; + if (LoopBody->size() > MaxLoopSize) + return false; + + // It should have a preheader containing nothing but an unconditional branch. + BasicBlock *PH = CurLoop->getLoopPreheader(); + if (!PH || &PH->front() != PH->getTerminator()) + return false; + auto *EntryBI = dyn_cast(PH->getTerminator()); + if (!EntryBI || EntryBI->isConditional()) + return false; + + // It should have a precondition block where the generated strlen intrinsic + // function can be inserted. + auto *PreCondBB = PH->getSinglePredecessor(); + if (!PreCondBB) + return false; + auto *PreCondBI = dyn_cast(PreCondBB->getTerminator()); + if (!PreCondBI || PreCondBI->isUnconditional()) + return false; + + // The loop exit must be conditioned on an icmp with 0. + // The icmp operand has to be a load on some SSA reg that increments + // by 1 in the loop. + auto *LoopTerm = dyn_cast(LoopBody->getTerminator()); + auto *LoopCond = matchCondition(LoopTerm, LoopBody); + if (!LoopCond || !LoopCond->hasOneUse()) + return false; + auto *LoopLoad = dyn_cast(LoopCond); + if (!LoopLoad) + return false; + + // Check that the vregs (phi and gep) are of valid form. + auto *IncPtr = LoopLoad->getPointerOperand(); + PHINode *LoopPHI = nullptr; + GetElementPtrInst *LoopGEP = nullptr; + if ((LoopGEP = dyn_cast(IncPtr))) { + LoopPHI = dyn_cast(LoopGEP->getPointerOperand()); + if (!LoopPHI || LoopPHI->getParent() != LoopBody) + return false; + // The phi node has to have incoming values from loop body and preheader + if (LoopPHI->getBasicBlockIndex(LoopBody) == -1 || + LoopPHI->getBasicBlockIndex(PH) == -1) + return false; + // The IncPtr could be a phi node if the first char was already null checked + } else if ((LoopPHI = dyn_cast(IncPtr))) { + if (LoopPHI->getBasicBlockIndex(LoopBody) == -1 || + LoopPHI->getBasicBlockIndex(PH) == -1) + return false; + LoopGEP = dyn_cast( + LoopPHI->getIncomingValueForBlock(LoopBody)); + } else + return false; + if (!LoopPHI || !LoopGEP || LoopGEP->getParent() != LoopBody) + return false; + if (LoopPHI->getNumIncomingValues() != 2 || + LoopPHI->getIncomingValueForBlock(LoopBody) != LoopGEP) + return false; + // The increment can only accumulate constant offset of 1 + auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); + APInt Offset(DL.getPointerTypeSizeInBits(LoopGEP->getType()), 0); + if (!LoopGEP->accumulateConstantOffset(DL, Offset)) + return false; + + // The string base (source) must be of pointer type. + Value *StrSrc = LoopPHI->getIncomingValueForBlock(PH); + PointerType *PT = dyn_cast(StrSrc->getType()); + if (!PT) + return false; + // Only transform strlen8 or strlen16 + if (PT->getPointerElementType()->getTypeID() != Type::IntegerTyID) + return false; + unsigned PtBW = PT->getPointerElementType()->getIntegerBitWidth(); + if (PtBW != 8 && PtBW != 16) + return false; + if (Offset != PtBW / 8) + return false; + + Value *ResInstr = nullptr; + if (IncPtr->hasNUses(3)) { + // Check if the increment vreg computes a pointer difference + // with the string base. If so, the loop should do nothing but increment it. + PtrToIntInst *SubLHS = nullptr; + Value *SubRHS = nullptr; + for (auto *U : IncPtr->users()) { + if (U == LoopLoad || U == LoopPHI || U == LoopGEP) + continue; + auto *LHSPhi = dyn_cast(U); + if (!LHSPhi || LHSPhi->getNumIncomingValues() != 1 || + !LHSPhi->hasOneUse()) + return false; + SubLHS = dyn_cast(LHSPhi->user_back()); + break; + } + // Patternmatching IR that subtracts pointers of this form: + // %inc.lcssa = phi i8* [ %inc, %loop ] + // %sub.ptr.lhs = ptrtoint i8* %inc.lcssa to i64 + // %sub.ptr.rhs = ptrtoint i8* %str.base to i64 + // %sub.ptr.sub = sub i64 %sub.ptr.lhs, %sub.ptr.rhs + if (!SubLHS || !SubLHS->hasOneUse()) + return false; + ResInstr = SubLHS->user_back(); + if (!match(ResInstr, m_Sub(m_Specific(SubLHS), m_Value(SubRHS))) || + !isa(SubRHS) || !SubRHS->hasOneUse()) + return false; + // Check that the RHS is the string base + if (cast(SubRHS)->getPointerOperand() != StrSrc) + return false; + + // If the string is not of i8*, there should be an lshr on the result + if (PtBW != 8) { + if (!ResInstr->hasOneUse()) + return false; + auto *LShr = dyn_cast(ResInstr->user_back()); + if (!LShr || LShr->getOpcode() != Instruction::LShr) + return false; + // Verify the shift amount + const auto *C = dyn_cast(LShr->getOperand(1)); + if (!C || C->getZExtValue() != PtBW / 8 - 1) + return false; + ResInstr = LShr; + } + + // loop can only have phi, gep, load, icmp, and br + if (LoopBody->size() != 5) + return false; + + } else if (IncPtr->hasNUses(2)) { + // If the increment vreg does not compute a pointer difference, + // check if there is an add instruction that is used as the strlen result. + // If so, the loop should do nothing else. + // Seeking form: + // %add.phi = phi i64 [ %inc, %loop ], [ 0, %loop.preheader ] + // %inc = add i64 %add.phi, 1 + for (BasicBlock::iterator I = LoopBody->begin(), E = LoopBody->end(); + I != E;) { + Instruction *Inst = &*I++; + BinaryOperator *AddOneOp; + if ((AddOneOp = dyn_cast(Inst))) { + if (AddOneOp->getOpcode() != Instruction::Add) + return false; + ConstantInt *AddInc = dyn_cast(AddOneOp->getOperand(1)); + if (!AddInc || !AddInc->isOne()) + return false; + auto *AddPhi = dyn_cast(AddOneOp->getOperand(0)); + if (!AddPhi || !AddPhi->hasOneUse() || + AddPhi->getNumIncomingValues() != 2 || + AddPhi->getBasicBlockIndex(LoopBody) == -1 || + AddPhi->getBasicBlockIndex(PH) == -1 || + AddPhi->getIncomingValueForBlock(LoopBody) != AddOneOp || + AddPhi->getParent() != LoopBody) + return false; + ConstantInt *StartVal = + dyn_cast(AddPhi->getIncomingValueForBlock(PH)); + if (!StartVal || !StartVal->isZero()) + return false; + ResInstr = AddOneOp; + break; + } + } + // loop can only have ptr phi, ptr gep, inc phi, inc add, + // load, icmp, and br. + if (LoopBody->size() != 7) + return false; + } else + return false; + + // Scan every instruction in the loop to ensure it is doing nothing else. + for (BasicBlock::iterator I = LoopBody->begin(), E = LoopBody->end(); + I != E;) { + Instruction *Inst = &*I++; + if (Inst == LoopLoad || Inst == LoopCond) + if (!Inst->hasOneUse()) + return false; + if (Inst->mayHaveSideEffects()) + return false; + if (Inst != LoopTerm && !Inst->getType()->isIntegerTy() && + !Inst->getType()->isPointerTy()) + return false; + } + + // Insert strlen and RAUW + IRBuilder<> Builder(PreCondBI); + Value *FuncCall = nullptr; + switch (PtBW) { + default: + return false; + case 8: + FuncCall = emitStrLen(StrSrc, Builder, DL, TLI); + break; + case 16: + FuncCall = createStrlen16Intrinsic(Builder, StrSrc); + break; + } + Value *Strlen = Builder.CreateZExtOrTrunc(FuncCall, ResInstr->getType()); + ResInstr->replaceAllUsesWith(Strlen); + RecursivelyDeleteTriviallyDeadInstructions(ResInstr, TLI); + + // Remove the loop-exit branch + ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody + ? Builder.getFalse() + : Builder.getTrue(); + LoopTerm->setCondition(NewLoopCond); + + // Delete the dead loop instructions + deleteDeadInstruction(cast(LoopCond)); + deleteDeadInstruction(cast(IncPtr)); + SE->forgetLoop(CurLoop); + return true; +} + /// Recognizes a population count idiom in a non-countable loop. /// /// If detected, transforms the relevant code to issue the popcount intrinsic diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -705,7 +705,7 @@ // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 - if (isOnlyUsedInZeroEqualityComparison(CI)) + if (isIterativelyOnlyUsedInZeroComparison(CI)) return B.CreateZExt(B.CreateLoad(B.getIntNTy(CharSize), Src, "strlenfirst"), CI->getType()); @@ -3025,6 +3025,8 @@ return optimizeMemCpy(CI, Builder); case Intrinsic::memmove: return optimizeMemMove(CI, Builder); + case Intrinsic::strlen16: + return optimizeStringLength(CI, Builder, 16); default: return nullptr; } diff --git a/llvm/test/Transforms/LoopIdiom/expand-strlen16.ll b/llvm/test/Transforms/LoopIdiom/expand-strlen16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/expand-strlen16.ll @@ -0,0 +1,43 @@ +; RUN: opt -codegenprepare -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; Replace the call to strlen16 intrinsic by it's equivalent loop + +; CHECK: lor.lhs.false: ; preds = %entry +; CHECK-NEXT: br label %loopbody +; CHECK: loopbody: ; preds = %lor.lhs.false, %loopbody +; CHECK-NEXT: %s16exp.phi = phi i16* [ %src, %lor.lhs.false ], [ %s16exp.pTmp, %loopbody ] +; CHECK-NEXT: %s16exp.pTmp = getelementptr inbounds i16, i16* %s16exp.phi, i64 1 +; CHECK-NEXT: %s16exp.ele = load i16, i16* %s16exp.pTmp +; CHECK-NEXT: %0 = icmp eq i16 %s16exp.ele, 0 +; CHECK-NEXT: br i1 %0, label %loopexit, label %loopbody +; CHECK: loopexit: ; preds = %loopbody +; CHECK-NEXT: %s16.endptr = ptrtoint i16* %s16exp.pTmp to i64 +; CHECK-NEXT: %s16.startptr = ptrtoint i16* %src to i64 +; CHECK-NEXT: %s16.diff = sub i64 %s16.endptr, %s16.startptr +; CHECK-NEXT: %s16.length = lshr i64 %s16.diff, 1 +; CHECK-NEXT: %1 = trunc i64 %s16.length to i32 +; CHECK-NEXT: ret i32 %1 + +; Function Attrs: nounwind +define i32 @myStringLen(i16* %src) local_unnamed_addr #0 { +entry: + %cmp = icmp eq i16* %src, null + br i1 %cmp, label %return, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %0 = tail call i32 @llvm.strlen16.p0i16(i16* nonnull %src) +; CHECK-NOT: call i32 @llvm.strlen16.p0i16 + ret i32 %0 + +return: ; preds = %entry + ret i32 0 +} + +; Function Attrs: argmemonly nounwind +declare i32 @llvm.strlen16.p0i16(i16*) #1 + +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/LoopIdiom/recognize-strlen.ll b/llvm/test/Transforms/LoopIdiom/recognize-strlen.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/recognize-strlen.ll @@ -0,0 +1,184 @@ +; RUN: opt -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +define i64 @valid_strlen1(i8* %Str) { +entry: + %tobool = icmp eq i8* %Str, null + br i1 %tobool, label %cleanup, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %0 = load i8, i8* %Str, align 1 + %cmp = icmp eq i8 %0, 0 + br i1 %cmp, label %cleanup, label %for.inc + +; CHECK: call i64 @strlen(i8* %Str) +; CHECK: br i1 true +for.inc: ; preds = %lor.lhs.false, %for.inc + %Src.09 = phi i8* [ %incdec.ptr, %for.inc ], [ %Str, %lor.lhs.false ] + %incdec.ptr = getelementptr inbounds i8, i8* %Src.09, i64 1 + %.pr = load i8, i8* %incdec.ptr, align 1 + %tobool2 = icmp eq i8 %.pr, 0 + br i1 %tobool2, label %for.end, label %for.inc + +; CHECK-NOT: sub +for.end: ; preds = %for.inc + %sub.ptr.lhs.cast = ptrtoint i8* %incdec.ptr to i64 + %sub.ptr.rhs.cast = ptrtoint i8* %Str to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + br label %cleanup + +cleanup: ; preds = %lor.lhs.false, %entry, %for.end + %retval.0 = phi i64 [ %sub.ptr.sub, %for.end ], [ 0, %entry ], [ 0, %lor.lhs.false ] + ret i64 %retval.0 +} + + +define i64 @valid_strlen2(i8* %Str) { +entry: + %tobool = icmp eq i8* %Str, null + br i1 %tobool, label %cleanup, label %for.cond + +; CHECK: call i64 @strlen(i8* %Str) +; CHECK: br i1 true +for.cond: ; preds = %entry, %for.cond + %Src.0 = phi i8* [ %incdec.ptr, %for.cond ], [ %Str, %entry ] + %0 = load i8, i8* %Src.0, align 1 + %tobool1 = icmp eq i8 %0, 0 + %incdec.ptr = getelementptr inbounds i8, i8* %Src.0, i64 1 + br i1 %tobool1, label %for.end, label %for.cond + +; CHECK-NOT: sub +for.end: ; preds = %for.cond + %sub.ptr.lhs.cast = ptrtoint i8* %Src.0 to i64 + %sub.ptr.rhs.cast = ptrtoint i8* %Str to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + br label %cleanup + + cleanup: ; preds = %entry, %for.end + %retval.0 = phi i64 [ %sub.ptr.sub, %for.end ], [ 0, %entry ] + ret i64 %retval.0 +} + + +define i64 @valid_strlen3(i8* readonly %Str) { +entry: + %tobool = icmp eq i8* %Str, null + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %0 = load i8, i8* %Str, align 1 + %tobool16 = icmp eq i8 %0, 0 + br i1 %tobool16, label %cleanup, label %while.body + +; CHECK: call i64 @strlen(i8* %Str) +; CHECK: br i1 true +while.body: ; preds = %while.cond.preheader, %while.body + %Len.08 = phi i64 [ %inc, %while.body ], [ 0, %while.cond.preheader ] + %Str.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %Str, %while.cond.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %Str.addr.07, i64 1 + %inc = add i64 %Len.08, 1 + %1 = load i8, i8* %incdec.ptr, align 1 + %tobool1 = icmp eq i8 %1, 0 + br i1 %tobool1, label %cleanup, label %while.body + +cleanup: ; preds = %while.body, %while.cond.preheader, %entry + %retval.0 = phi i64 [ 0, %entry ], [ 0, %while.cond.preheader ], [ %inc, %while.body ] + ret i64 %retval.0 +} + + +define zeroext i32 @valid_strlen4(i8* %Str) { +entry: + %tobool = icmp eq i8* %Str, null + br i1 %tobool, label %cleanup, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %0 = load i8, i8* %Str, align 1 + %cmp = icmp eq i8 %0, 0 + br i1 %cmp, label %cleanup, label %for.inc + +; CHECK: call i64 @strlen(i8* %Str) +; CHECK: br i1 true +for.inc: ; preds = %lor.lhs.false, %for.inc + %Src.010 = phi i8* [ %incdec.ptr, %for.inc ], [ %Str, %lor.lhs.false ] + %incdec.ptr = getelementptr inbounds i8, i8* %Src.010, i64 1 + %.pr = load i8, i8* %incdec.ptr, align 1 + %tobool2 = icmp eq i8 %.pr, 0 + br i1 %tobool2, label %for.end, label %for.inc + +; CHECK: trunc i64 %strlen to i32 +; CHECK-NOT: sub +for.end: ; preds = %for.inc + %sub.ptr.lhs.cast = ptrtoint i8* %incdec.ptr to i64 + %sub.ptr.rhs.cast = ptrtoint i8* %Str to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %conv3 = trunc i64 %sub.ptr.sub to i32 + br label %cleanup + +cleanup: ; preds = %lor.lhs.false, %entry, %for.end + %retval.0 = phi i32 [ %conv3, %for.end ], [ 0, %entry ], [ 0, %lor.lhs.false ] + ret i32 %retval.0 +} + + +define i32 @valid_strlen5(i16* %Str) #1 { +entry: + %cmp = icmp eq i16* %Str, null + br i1 %cmp, label %return, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %0 = load i16, i16* %Str, align 1 + %cmp1 = icmp eq i16 %0, 0 + br i1 %cmp1, label %return, label %while.cond + +; CHECK: call i32 @llvm.strlen16.p0i16(i16* %Str) +; CHECK: br i1 true +while.cond: ; preds = %lor.lhs.false, %while.cond + %src.pn = phi i16* [ %incdec.ptr, %while.cond ], [ %Str, %lor.lhs.false ] + %incdec.ptr = getelementptr inbounds i16, i16* %src.pn, i64 1 + %1 = load i16, i16* %incdec.ptr, align 1 + %tobool = icmp eq i16 %1, 0 + br i1 %tobool, label %while.end, label %while.cond + +; CHECK-NOT: sub +; CHECK-NOT: lshr +while.end: ; preds = %while.cond + %sub.ptr.lhs.cast = ptrtoint i16* %incdec.ptr to i64 + %sub.ptr.rhs.cast = ptrtoint i16* %Str to i64 + %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %2 = lshr exact i64 %sub.ptr.sub, 1 + %conv2 = trunc i64 %2 to i32 + br label %return + +return: ; preds = %entry, %lor.lhs.false, %while.end + %retval.0 = phi i32 [ %conv2, %while.end ], [ 0, %lor.lhs.false ], [ 0, %entry ] + ret i32 %retval.0 +} + + +define i64 @valid_strlen6(i16* %Str) { +entry: + %tobool = icmp eq i16* %Str, null + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %0 = load i16, i16* %Str, align 1 + %tobool16 = icmp eq i16 %0, 0 + br i1 %tobool16, label %cleanup, label %while.body + +; CHECK: call i32 @llvm.strlen16.p0i16(i16* %Str) +; CHECK: br i1 true +while.body: ; preds = %while.cond.preheader, %while.body + %Len.08 = phi i64 [ %inc, %while.body ], [ 0, %while.cond.preheader ] + %Str.addr.07 = phi i16* [ %incdec.ptr, %while.body ], [ %Str, %while.cond.preheader ] + %incdec.ptr = getelementptr inbounds i16, i16* %Str.addr.07, i64 1 + %inc = add i64 %Len.08, 1 + %1 = load i16, i16* %incdec.ptr, align 1 + %tobool1 = icmp eq i16 %1, 0 + br i1 %tobool1, label %cleanup, label %while.body + +cleanup: ; preds = %while.body, %while.cond.preheader, %entry + %retval.0 = phi i64 [ 0, %entry ], [ 0, %while.cond.preheader ], [ %inc, %while.body ] + ret i64 %retval.0 +} diff --git a/llvm/test/Transforms/LoopIdiom/strlen-fold.ll b/llvm/test/Transforms/LoopIdiom/strlen-fold.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/strlen-fold.ll @@ -0,0 +1,145 @@ +; RUN: opt -O1 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; If all users of a strlen16 call are icmp with 0, +; then fold the call into a single load of the first char +define i32 @basic(i16* %src) { +entry: +;CHECK-LABEL: basic + +; CHECK: %strlenfirst = load i16, i16* %src +; CHECK-NEXT: %strlenfirst + + %call = call i32 @llvm.strlen16.p0i16(i16* %src) + %cmp = icmp eq i32 %call, 0 + %cond = select i1 %cmp, i32 1, i32 2 + +; CHECK: @llvm.strlen16.p0i16 +; CHECK-NOT: %strlenfirst + + %call2 = call i32 @llvm.strlen16.p0i16(i16* %src) + %cmp2 = icmp eq i32 %call2, 1 + %cond2 = select i1 %cmp2, i32 5, i32 9 + + %sum = add i32 %cond, %cond2 + ret i32 %sum +} + +; If all users of a strlen16 call are icmp with 0 or are +; "combined compares", ie. (icmp (A|B), 0), then either +; strlen16s can be folded. +define i32 @combined_cmp(i16* %str1, i16* %str2) { +entry: +; CHECK-LABEL: combined_cmp + +; CHECK: %strlenfirst{{[0-9]*}} = load i16, i16* %str1 +; CHECK-NOT: @llvm.strlen16.p0i16 + %call = call i32 @llvm.strlen16.p0i16(i16* %str1) + +; CHECK: %strlenfirst{{[0-9]*}} = load i16, i16* %str2 +; CHECK-NOT: @llvm.strlen16.p0i16 + %call1 = call i32 @llvm.strlen16.p0i16(i16* %str2) + + %or = or i32 %call1, %call + %cmp = icmp eq i32 %or, 0 + %cond = select i1 %cmp, i32 1, i32 2 + ret i32 %cond +} + +; Check that every strlen sinks to an icmp eq with 0. +; If not, the call cannot be folded. +define i32 @invalid_combined_cmp(i16* %str1, i16* %str2) { +entry: +; CHECK-LABEL: invalid_combined_cmp + +; CHECK-NOT: %strlenfirst + %call = call i32 @llvm.strlen16.p0i16(i16* %str1) + %call1 = call i32 @llvm.strlen16.p0i16(i16* %str2) + %or = or i32 %call, %call1 + %add = add i32 %call, %call1 + %cmp = icmp eq i32 %or, 0 + %cond = select i1 %cmp, i32 1, i32 2 + %ret = add i32 %cond, %add + ret i32 %ret +} + +; If instructions in the def-use graph after the root (strlen16) +; and the leafs (terminators) are bit extension instructions then +; the strlen16 may still be a candidate for folding since +; these instructions don't require the full strlen in order to be +; correct. +define i1 @ext_cmp(i16* %str1, i16* %str2) { +entry: +; CHECK-LABEL: ext_cmp + +; CHECK: %strlenfirst{{[0-9]*}} = load i16, i16* %str1 +; CHECK-NOT: @llvm.strlen16.p0i16 + %call = call i32 @llvm.strlen16.p0i16(i16* %str1) + +; CHECK: %strlenfirst{{[0-9]*}} = load i16, i16* %str2 +; CHECK-NOT: @llvm.strlen16.p0i16 + %call1 = call i32 @llvm.strlen16.p0i16(i16* %str2) + + %zext = zext i32 %call to i64 + %zext1 = zext i32 %call1 to i64 + %or = or i64 %zext, %zext1 + %tobool = icmp sgt i64 %or, 0 + ret i1 %tobool +} + + +; If an intermediate instruction is a truncation then the root +; strlen16 cannot be considered for folding. +define i1 @trunc_cmp(i16* %str) { +entry: +; CHECK-LABEL: trunc_cmp + +; CHECK: @llvm.strlen16.p0i16 +; CHECK-NOT: %strlenfirst + %call = call i32 @llvm.strlen16.p0i16(i16* %str) + + %trunc = trunc i32 %call to i8 + %tobool = icmp eq i8 %trunc, 0 + ret i1 %tobool +} + + +; If the operands of the combined compare (A|B) are phi instructions +; descending from a strlen16, the strlen16s are still candidates +; for folding. +define i1 @phi_cmp(i16* %str1, i16* %str2) { +; CHECK-LABEL: phi_cmp +str1_null_check: + %cmp = icmp eq i16* %str1, null + br i1 %cmp, label %str2_null_check, label %str1_strlen + +str1_strlen: +; CHECK: %strlenfirst{{[0-9]*}} = load i16, i16* %str1 +; CHECK-NOT: @llvm.strlen16.p0i16 + %call = call i32 @llvm.strlen16.p0i16(i16* nonnull %str1) + br label %str2_null_check + +str2_null_check: + ; %call1 is either the string length or 0 if null + %call1 = phi i32 [ %call, %str1_strlen ], [ 0, %str1_null_check ] + %cmp.i1 = icmp eq i16* %str2, null + br i1 %cmp.i1, label %combined_cmp, label %str2_strlen + +str2_strlen: +; CHECK-NOT: @llvm.strlen16.p0i16 +; CHECK: %strlenfirst{{[0-9]*}} = load i16, i16* %str2 + %call2 = call i32 @llvm.strlen16.p0i16(i16* nonnull %str2) + br label %combined_cmp + +combined_cmp: + %call3 = phi i32 [ %call2, %str2_strlen ], [ 0, %str2_null_check ] + %or = or i32 %call1, %call3 + %cmp.i2 = icmp eq i32 %or, 0 + ret i1 %cmp.i2 +} + +declare i32 @llvm.strlen16.p0i16(i16*) #0 + +attributes #0 = { argmemonly nounwind }