Index: lib/Target/ARM/ARMCodeGenPrepare.cpp =================================================================== --- lib/Target/ARM/ARMCodeGenPrepare.cpp +++ lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -109,24 +109,25 @@ namespace { class IRPromoter { SmallPtrSet NewInsts; - SmallVector InstsToRemove; - DenseMap TruncTysMap; + SmallPtrSet InstsToRemove; + DenseMap> TruncTysMap; SmallPtrSet Promoted; Module *M = nullptr; LLVMContext &Ctx; IntegerType *ExtTy = nullptr; IntegerType *OrigTy = nullptr; - - void PrepareConstants(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &SafeToPromote); - void ExtendSources(SmallPtrSetImpl &Sources); - void PromoteTree(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks, - SmallPtrSetImpl &SafeToPromote); - void TruncateSinks(SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks); - void Cleanup(SmallPtrSetImpl &Visited); + SmallPtrSetImpl *Visited; + SmallPtrSetImpl *Sources; + SmallPtrSetImpl *Sinks; + SmallPtrSetImpl *SafeToPromote; + + void ReplaceAllUsersOfWith(Value *From, Value *To); + void PrepareConstants(void); + void ExtendSources(void); + void ConvertTruncs(void); + void PromoteTree(void); + void TruncateSinks(void); + void Cleanup(void); public: IRPromoter(Module *M) : M(M), Ctx(M->getContext()), @@ -192,6 +193,10 @@ return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize; } +static bool LessThanTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize; +} + /// Some instructions can use 8- and 16-bit operands, and we don't need to /// promote anything larger. We disallow booleans to make life easier when /// dealing with icmps but allow any other integer that is <= 16 bits. Void @@ -250,12 +255,12 @@ return LessOrEqualTypeSize(Store->getValueOperand()); if (auto *Return = dyn_cast(V)) return LessOrEqualTypeSize(Return->getReturnValue()); - if (auto *Trunc = dyn_cast(V)) - return EqualTypeSize(Trunc->getOperand(0)); if (auto *ZExt = dyn_cast(V)) return GreaterThanTypeSize(ZExt); + if (auto *Switch = dyn_cast(V)) + return LessThanTypeSize(Switch->getCondition()); if (auto *ICmp = dyn_cast(V)) - return ICmp->isSigned(); + return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0)); return isa(V); } @@ -426,23 +431,32 @@ llvm_unreachable("unhandled opcode for narrow intrinsic"); } -static void ReplaceAllUsersOfWith(Value *From, Value *To) { +void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) { SmallVector Users; Instruction *InstTo = dyn_cast(To); + bool ReplacedAll = true; + + LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To + << "\n"); + for (Use &U : From->uses()) { auto *User = cast(U.getUser()); - if (InstTo && User->isIdenticalTo(InstTo)) + if (InstTo && User->isIdenticalTo(InstTo)) { + ReplacedAll = false; continue; + } Users.push_back(User); } for (auto *U : Users) U->replaceUsesOfWith(From, To); + + if (ReplacedAll) + if (auto *I = dyn_cast(From)) + InstsToRemove.insert(I); } -void -IRPromoter::PrepareConstants(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &SafeToPromote) { +void IRPromoter::PrepareConstants() { IRBuilder<> Builder{Ctx}; // First step is to prepare the instructions for mutation. Most constants // just need to be zero extended into their new type, but complications arise @@ -463,12 +477,12 @@ // immediate as operand 1, we create an equivalent instruction using a // positive immediate. That positive immediate can then be zext along with // all the other immediates later. - for (auto *V : Visited) { + for (auto *V : *Visited) { if (!isa(V)) continue; auto *I = cast(V); - if (SafeToPromote.count(I)) { + if (SafeToPromote->count(I)) { if (!isa(I)) continue; @@ -493,16 +507,16 @@ NewInst->copyIRFlags(I); NewInsts.insert(NewInst); } - InstsToRemove.push_back(I); + InstsToRemove.insert(I); I->replaceAllUsesWith(NewVal); } } } for (auto *I : NewInsts) - Visited.insert(I); + Visited->insert(I); } -void IRPromoter::ExtendSources(SmallPtrSetImpl &Sources) { +void IRPromoter::ExtendSources() { IRBuilder<> Builder{Ctx}; auto InsertZExt = [&](Value *V, Instruction *InsertPt) { @@ -520,13 +534,13 @@ I->moveAfter(InsertPt); NewInsts.insert(I); } + ReplaceAllUsersOfWith(V, ZExt); - TruncTysMap[ZExt] = TruncTysMap[V]; }; // Now, insert extending instructions between the sources and their users. LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n"); - for (auto V : Sources) { + for (auto V : *Sources) { LLVM_DEBUG(dbgs() << " - " << *V << "\n"); if (auto *I = dyn_cast(V)) InsertZExt(I, I); @@ -540,22 +554,19 @@ } } -void IRPromoter::PromoteTree(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks, - SmallPtrSetImpl &SafeToPromote) { +void IRPromoter::PromoteTree() { LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n"); IRBuilder<> Builder{Ctx}; // Mutate the types of the instructions within the tree. Here we handle // constant operands. - for (auto *V : Visited) { - if (Sources.count(V)) + for (auto *V : *Visited) { + if (Sources->count(V)) continue; auto *I = cast(V); - if (Sinks.count(I)) + if (Sinks->count(I)) continue; for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) { @@ -578,15 +589,15 @@ // Finally, any instructions that should be promoted but haven't yet been, // need to be handled using intrinsics. - for (auto *V : Visited) { + for (auto *V : *Visited) { auto *I = dyn_cast(V); if (!I) continue; - if (Sources.count(I) || Sinks.count(I)) + if (Sources->count(I) || Sinks->count(I)) continue; - if (!shouldPromote(I) || SafeToPromote.count(I) || NewInsts.count(I)) + if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I)) continue; assert(EnableDSP && "DSP intrinisc insertion not enabled!"); @@ -600,29 +611,21 @@ Builder.SetCurrentDebugLocation(I->getDebugLoc()); Value *Args[] = { I->getOperand(0), I->getOperand(1) }; CallInst *Call = Builder.CreateCall(DSPInst, Args); - ReplaceAllUsersOfWith(I, Call); - InstsToRemove.push_back(I); NewInsts.insert(Call); - TruncTysMap[Call] = OrigTy; + ReplaceAllUsersOfWith(I, Call); } } -void IRPromoter::TruncateSinks(SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks) { +void IRPromoter::TruncateSinks() { LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n"); IRBuilder<> Builder{Ctx}; - auto InsertTrunc = [&](Value *V) -> Instruction* { + auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* { if (!isa(V) || !isa(V->getType())) return nullptr; - if ((!Promoted.count(V) && !NewInsts.count(V)) || !TruncTysMap.count(V) || - Sources.count(V)) - return nullptr; - - Type *TruncTy = TruncTysMap[V]; - if (TruncTy == ExtTy) + if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V)) return nullptr; LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for " @@ -636,14 +639,15 @@ // Fix up any stores or returns that use the results of the promoted // chain. - for (auto I : Sinks) { - LLVM_DEBUG(dbgs() << " - " << *I << "\n"); + for (auto I : *Sinks) { + LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n"); // Handle calls separately as we need to iterate over arg operands. if (auto *Call = dyn_cast(I)) { for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { Value *Arg = Call->getArgOperand(i); - if (Instruction *Trunc = InsertTrunc(Arg)) { + Type *Ty = TruncTysMap[Call][i]; + if (Instruction *Trunc = InsertTrunc(Arg, Ty)) { Trunc->moveBefore(Call); Call->setArgOperand(i, Trunc); } @@ -651,9 +655,20 @@ continue; } + // Special case switches because we need to truncate the condition. + if (auto *Switch = dyn_cast(I)) { + Type *Ty = TruncTysMap[Switch][0]; + if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) { + Trunc->moveBefore(Switch); + Switch->setCondition(Trunc); + } + continue; + } + // Now handle the others. for (unsigned i = 0; i < I->getNumOperands(); ++i) { - if (Instruction *Trunc = InsertTrunc(I->getOperand(i))) { + Type *Ty = TruncTysMap[I][i]; + if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) { Trunc->moveBefore(I); I->setOperand(i, Trunc); } @@ -661,10 +676,10 @@ } } -void IRPromoter::Cleanup(SmallPtrSetImpl &Visited) { +void IRPromoter::Cleanup() { // Some zexts will now have become redundant, along with their trunc // operands, so remove them - for (auto V : Visited) { + for (auto V : *Visited) { if (!isa(V)) continue; @@ -674,9 +689,9 @@ Value *Src = ZExt->getOperand(0); if (ZExt->getSrcTy() == ZExt->getDestTy()) { - LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast.\n"); + LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt + << "\n"); ReplaceAllUsersOfWith(ZExt, Src); - InstsToRemove.push_back(ZExt); continue; } @@ -686,10 +701,7 @@ auto *Trunc = cast(Src); assert(Trunc->getOperand(0)->getType() == ExtTy && "expected inserted trunc to be operating on i32"); - LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: " - << *Trunc->getOperand(0)); ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0)); - InstsToRemove.push_back(ZExt); } } @@ -705,6 +717,29 @@ Promoted.clear(); } +void IRPromoter::ConvertTruncs() { + IRBuilder<> Builder{Ctx}; + + for (auto *V : *Visited) { + if (!isa(V) || Sources->count(V)) + continue; + + auto *Trunc = cast(V); + assert(LessThanTypeSize(Trunc) && "expected narrow trunc"); + + Builder.SetInsertPoint(Trunc); + unsigned NumBits = + cast(Trunc->getType())->getScalarSizeInBits(); + ConstantInt *Mask = ConstantInt::get(Ctx, APInt::getMaxValue(NumBits)); + Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask); + + if (auto *I = dyn_cast(Masked)) + NewInsts.insert(I); + + ReplaceAllUsersOfWith(Trunc, Masked); + } +} + void IRPromoter::Mutate(Type *OrigTy, SmallPtrSetImpl &Visited, SmallPtrSetImpl &Sources, @@ -718,28 +753,47 @@ assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() && "original type not smaller than extended type"); - // Cache original types. - for (auto *V : Visited) - TruncTysMap[V] = V->getType(); + this->Visited = &Visited; + this->Sources = &Sources; + this->Sinks = &Sinks; + this->SafeToPromote = &SafeToPromote; + + // Cache original types of the values that will likely need truncating + for (auto *I : Sinks) { + if (auto *Call = dyn_cast(I)) { + for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { + Value *Arg = Call->getArgOperand(i); + TruncTysMap[Call].push_back(Arg->getType()); + } + } else if (auto *Switch = dyn_cast(I)) + TruncTysMap[I].push_back(Switch->getCondition()->getType()); + else { + for (unsigned i = 0; i < I->getNumOperands(); ++i) + TruncTysMap[I].push_back(I->getOperand(i)->getType()); + } + } // Convert adds and subs using negative immediates to equivalent instructions // that use positive constants. - PrepareConstants(Visited, SafeToPromote); + PrepareConstants(); // Insert zext instructions between sources and their users. - ExtendSources(Sources); + ExtendSources(); + + // Convert any truncs, that aren't sources, into AND masks. + ConvertTruncs(); // Promote visited instructions, mutating their types in place. Also insert // DSP intrinsics, if enabled, for adds and subs which would be unsafe to // promote. - PromoteTree(Visited, Sources, Sinks, SafeToPromote); + PromoteTree(); // Insert trunc instructions for use by calls, stores etc... - TruncateSinks(Sources, Sinks); + TruncateSinks(); // Finally, remove unecessary zexts and truncs, delete old instructions and // clear the data structures. - Cleanup(Visited); + Cleanup(); LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n"); } Index: test/CodeGen/ARM/CGP/arm-cgp-calls.ll =================================================================== --- test/CodeGen/ARM/CGP/arm-cgp-calls.ll +++ test/CodeGen/ARM/CGP/arm-cgp-calls.ll @@ -200,11 +200,23 @@ ret i1 %retval } +; CHECK-LABEL: promote_arg_pass_to_call +; CHECK-NOT: uxt +define i16 @promote_arg_pass_to_call(i16 zeroext %arg1, i16 zeroext %arg2) { + %conv = add nuw i16 %arg1, 15 + %mul = mul nuw nsw i16 %conv, 3 + %cmp = icmp ult i16 %mul, %arg2 + %trunc = trunc i16 %arg1 to i8 + %res = call zeroext i16 @dummy4(i1 %cmp, i8 %trunc, i16 %arg1) + ret i16 %res +} + declare i32 @assert(...) declare i8 @dummy_i8(i8) declare i8 @dummy2(i8*, i8, i8) declare i16 @dummy3(i16) +declare i16 @dummy4(i1, i8, i16) declare dso_local i32 @e(...) local_unnamed_addr #1 declare dso_local zeroext i16 @f(...) local_unnamed_addr #1 Index: test/CodeGen/ARM/CGP/arm-cgp-casts.ll =================================================================== --- test/CodeGen/ARM/CGP/arm-cgp-casts.ll +++ test/CodeGen/ARM/CGP/arm-cgp-casts.ll @@ -583,6 +583,8 @@ ret i8 %retval } +; CHECK-COMMON-LABEL: bitcast_i1 +; CHECK-COMMON-NOT: uxt define i32 @bitcast_i1(i16 zeroext %a, i32 %b, i32 %c) { entry: %0 = bitcast i1 1 to i1 @@ -601,3 +603,40 @@ %retval = phi i32 [ %select, %if.then ], [ 0, %entry ] ret i32 %retval } + +; CHECK-COMMON-LABEL: search_back_through_trunc +; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: cmp +; CHECK-COMMON: strb +; CHECK-COMMON: strb +define void @search_back_through_trunc(i8* %a, i8* %b, i8* %c, i8* %d, i16* %e) { +entry: + %0 = load i8, i8* %a, align 1 + %conv106 = zext i8 %0 to i16 + %shl = shl nuw i16 %conv106, 8 + %1 = load i8, i8* %b, align 1 + %conv108 = zext i8 %1 to i16 + %or109 = or i16 %shl, %conv108 + %2 = load i8, i8* %c, align 1 + %conv119 = zext i8 %2 to i16 + %shl120 = shl nuw i16 %conv119, 8 + %3 = load i8, i8* %d, align 1 + %conv122 = zext i8 %3 to i16 + %or123 = or i16 %shl120, %conv122 + %cmp133 = icmp eq i16 %or109, %or123 + br i1 %cmp133, label %if.end183, label %if.else136 + +if.else136: + %4 = load i16, i16* %e, align 2 + %extract.t854 = trunc i16 %4 to i8 + %extract856 = lshr i16 %4, 8 + %extract.t857 = trunc i16 %extract856 to i8 + br label %if.end183 + +if.end183: + %w.0.off0 = phi i8 [ %extract.t854, %if.else136 ], [ %1, %entry ] + %w.0.off8 = phi i8 [ %extract.t857, %if.else136 ], [ %2, %entry ] + store i8 %w.0.off8, i8* %c, align 1 + store i8 %w.0.off0, i8* %d, align 1 + ret void +} Index: test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll =================================================================== --- test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll +++ test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll @@ -172,3 +172,15 @@ exit: ret i16 %unrelated } + +; CHECK-COMMON-LABEL: promote_arg_return +; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: strb +define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, i8* %res) { + %add = add nuw i16 %arg1, 15 + %mul = mul nuw nsw i16 %add, 3 + %cmp = icmp ult i16 %mul, %arg2 + %conv = zext i1 %cmp to i8 + store i8 %conv, i8* %res + ret i16 %arg1 +}