Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -395,16 +395,15 @@
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
-  ///   %andResult = and %val1, #imm-with-one-bit-set;
+  ///   %andResult = and %val1, #mask
   ///   %icmpResult = icmp %andResult, 0
-  ///   br i1 %icmpResult, label %dest1, label %dest2
   /// \endcode
   /// into a single machine instruction of a form like:
   /// \code
-  ///   brOnBitSet %register, #bitNumber, dest
+  ///   cc = test %register, #mask
   /// \endcode
-  bool isMaskAndBranchFoldingLegal() const {
-    return MaskAndBranchFoldingIsLegal;
+  virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
+    return false;
   }
 
   /// Return true if the target should transform:
@@ -2197,10 +2196,6 @@
   /// the branch is usually predicted right.
   bool PredictableSelectIsExpensive;
 
-  /// MaskAndBranchFoldingIsLegal - Indicates if the target supports folding
-  /// a mask of a single bit, a compare, and a branch into a single instruction.
-  bool MaskAndBranchFoldingIsLegal;
-
   /// \see enableExtLdPromotion.
   bool EnableExtLdPromotion;
 
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -77,7 +77,7 @@
 STATISTIC(NumRetsDup, "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
+STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
 static cl::opt<bool> DisableBranchOpts(
@@ -215,7 +215,6 @@
     bool optimizeExtractElementInst(Instruction *Inst);
     bool dupRetToEnableTailCallOpts(BasicBlock *BB);
     bool placeDbgValues(Function &F);
-    bool sinkAndCmp(Function &F);
     bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
                         Instruction *&Inst,
                         const SmallVectorImpl<Instruction *> &Exts,
@@ -290,14 +289,8 @@
   // find a node corresponding to the value.
   EverMadeChange |= placeDbgValues(F);
 
-  // If there is a mask, compare against zero, and branch that can be combined
-  // into a single target instruction, push the mask and compare into branch
-  // users. Do this before OptimizeBlock -> OptimizeInst ->
-  // OptimizeCmpExpression, which perturbs the pattern being searched for.
-  if (!DisableBranchOpts) {
-    EverMadeChange |= sinkAndCmp(F);
+  if (!DisableBranchOpts)
     EverMadeChange |= splitBranchCondition(F);
-  }
 
   bool MadeChange = true;
   while (MadeChange) {
@@ -1090,6 +1083,77 @@
   return false;
 }
 
+/// Duplicate and sink the given 'and' instruction into user blocks where it is
+/// used in a compare to allow isel to generate better code for targets where
+/// this operation can be combined.
+///
+/// Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI,
+                                  const TargetLowering &TLI) {
+
+  // Nothing to do for single use in same basic block.
+  if (AndI->hasOneUse() &&
+      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+    return false;
+
+  // Try to avoid cases where sinking/duplicating is likely to increase register
+  // pressure.
+  if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+      !isa<ConstantInt>(AndI->getOperand(1)) &&
+      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+    return false;
+
+  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+    return false;
+
+  for (auto *U : AndI->users()) {
+    Instruction *User = cast<Instruction>(U);
+
+    // Only sink for and mask feeding icmp with 0.
+    if (!isa<ICmpInst>(User))
+      return false;
+
+    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+    if (!CmpC || !CmpC->isZero())
+      return false;
+  }
+
+  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+  DEBUG(AndI->getParent()->dump());
+
+  // Push the 'and' into the same block as the icmp 0. There should only be
+  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+  // others, so we don't need to keep track of which BBs we insert into.
+  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+    // Keep the 'and' in the same place if the use is already in the same block.
+    Instruction *InsertPt =
+        User->getParent() == AndI->getParent() ? AndI : User;
+    Instruction *InsertedAnd =
+        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+                               AndI->getOperand(1), "", InsertPt);
+    // Propagate the debug info.
+    InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+    // Replace a use of the 'and' with a use of the new 'and'.
+    TheUse = InsertedAnd;
+    ++NumAndUses;
+    DEBUG(User->getParent()->dump());
+  }
+
+  // We removed all uses, nuke the and.
+  AndI->eraseFromParent();
+  return true;
+}
+
 /// Check if the candidates could be combined with a shift instruction, which
 /// includes:
 /// 1. Truncate instruction
@@ -4534,13 +4598,10 @@
       !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
     return false;
 
-  // Skip loads we've already transformed or have no reason to transform.
-  if (Load->hasOneUse()) {
-    User *LoadUser = *Load->user_begin();
-    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
-        !dyn_cast<PHINode>(LoadUser))
-      return false;
-  }
+  // Skip loads we've already transformed.
+  if (Load->hasOneUse() &&
+      InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+    return false;
 
   // Look at all uses of Load, looking through phis, to determine how many bits
   // of the loaded value are needed.
@@ -4636,6 +4697,9 @@
   IRBuilder<> Builder(Load->getNextNode());
   auto *NewAnd = dyn_cast<Instruction>(
       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+  // Mark this instruction as "inserted by CGP", so that other
+  // optimizations don't touch it.
+  InsertedInsts.insert(NewAnd);
 
   // Replace all uses of load with new and (except for the use of load in the
   // new and itself).
@@ -5550,6 +5614,10 @@
 
   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
 
+  if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
+      EnableAndCmpSinking && TLI)
+    return sinkAndCmp0Expression(BinOp, *TLI);
+
   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                 BinOp->getOpcode() == Instruction::LShr)) {
     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
@@ -5679,68 +5747,6 @@
   return MadeChange;
 }
 
-// If there is a sequence that branches based on comparing a single bit
-// against zero that can be combined into a single instruction, and the
-// target supports folding these into a single instruction, sink the
-// mask and compare into the branch uses. Do this before OptimizeBlock ->
-// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
-// searched for.
-bool CodeGenPrepare::sinkAndCmp(Function &F) {
-  if (!EnableAndCmpSinking)
-    return false;
-  if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
-    return false;
-  bool MadeChange = false;
-  for (BasicBlock &BB : F) {
-    // Does this BB end with the following?
-    //   %andVal = and %val, #single-bit-set
-    //   %icmpVal = icmp %andResult, 0
-    //   br i1 %cmpVal label %dest1, label %dest2"
-    BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
-    if (!Brcc || !Brcc->isConditional())
-      continue;
-    ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
-    if (!Cmp || Cmp->getParent() != &BB)
-      continue;
-    ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
-    if (!Zero || !Zero->isZero())
-      continue;
-    Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
-    if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
-      continue;
-    ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
-    if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
-      continue;
-    DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
-
-    // Push the "and; icmp" for any users that are conditional branches.
-    // Since there can only be one branch use per BB, we don't need to keep
-    // track of which BBs we insert into.
-    for (Use &TheUse : Cmp->uses()) {
-      // Find brcc use.
-      BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
-      if (!BrccUser || !BrccUser->isConditional())
-        continue;
-      BasicBlock *UserBB = BrccUser->getParent();
-      if (UserBB == &BB) continue;
-      DEBUG(dbgs() << "found Brcc use\n");
-
-      // Sink the "and; icmp" to use.
-      MadeChange = true;
-      BinaryOperator *NewAnd =
-        BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
-                                  BrccUser);
-      CmpInst *NewCmp =
-        CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
-                        "", BrccUser);
-      TheUse = NewCmp;
-      ++NumAndCmpsMoved;
-      DEBUG(BrccUser->getParent()->dump());
-    }
-  }
-  return MadeChange;
-}
-
 /// \brief Scale down both weights to fit into uint32_t.
 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -838,7 +838,6 @@
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
-  MaskAndBranchFoldingIsLegal = false;
   EnableExtLdPromotion = false;
   HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -412,6 +412,8 @@
     return true;
   }
 
+  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
   bool hasAndNotCompare(SDValue) const override {
     // 'bics'
     return true;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -554,8 +554,6 @@
   setSchedulingPreference(Sched::Hybrid);
 
-  // Enable TBZ/TBNZ
-  MaskAndBranchFoldingIsLegal = true;
   EnableExtLdPromotion = true;
 
   // Set required alignment.
   setMinFunctionAlignment(2);
@@ -10653,6 +10651,19 @@
       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
 }
 
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
+  // this is likely to allow the and/cmp/br to fold into a single tbz
+  // instruction. It may be beneficial to sink in other cases, but we would
+  // have to check that the cmp would not get folded into the br to form a cbz
+  // for these to be beneficial.
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask)
+    return false;
+  return Mask->getUniqueInteger().isPowerOf2();
+}
+
 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
   // Update IsSplitCSR in AArch64unctionInfo.
   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -806,6 +806,8 @@
       return false;
     }
 
+    bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
     bool hasAndNotCompare(SDValue Y) const override;
 
     /// Return the value type to use for ISD::SETCC.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4448,6 +4448,11 @@
   return Subtarget.hasFastLZCNT();
 }
 
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  return true;
+}
+
 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   if (!Subtarget.hasBMI())
     return false;
Index: test/CodeGen/AArch64/and-sink.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/and-sink.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=aarch64 -verify-machineinstrs < %s | FileCheck %s
+
+@A = global i32 zeroinitializer
+@B = global i32 zeroinitializer
+@C = global i32 zeroinitializer
+
+; Test that and is sunk into cmp block to form tbz.
+define i32 @and_sink1(i32 %a, i1 %c) {
+; CHECK-LABEL: and_sink1:
+; CHECK: tbz w1, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: tbnz {{w[0-9]+}}, #2
+  %and = and i32 %a, 4
+  br i1 %c, label %bb0, label %bb2
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that both 'and' and cmp get sunk to form tbz.
+define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
+; CHECK-LABEL: and_sink2:
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: tbz w1, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:B]
+; CHECK: tbz w2, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:C]
+; CHECK: tbnz {{w[0-9]+}}, #2
+  %and = and i32 %a, 4
+  store i32 0, i32* @A
+  br i1 %c, label %bb0, label %bb3
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @B
+  br i1 %c2, label %bb1, label %bb3
+bb1:
+  store i32 0, i32* @C
+  br i1 %cmp, label %bb2, label %bb0
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
+
+; Test that 'and' is not sunk since cbz is a better alternative.
+define i32 @and_sink3(i32 %a) {
+; CHECK-LABEL: and_sink3:
+; CHECK: and [[REG:w[0-9]+]], w0, #0x3
+; CHECK: [[LOOP:.L[A-Z0-9_]+]]:
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: cbz [[REG]], [[LOOP]]
+  %and = and i32 %a, 3
+  br label %bb0
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb0, label %bb2
+bb2:
+  ret i32 0
+}
Index: test/CodeGen/AArch64/fast-isel-tbz.ll
===================================================================
--- test/CodeGen/AArch64/fast-isel-tbz.ll
+++ test/CodeGen/AArch64/fast-isel-tbz.ll
@@ -278,8 +278,24 @@
 ; Test that we don't fold the 'and' instruction into the compare.
 define i32 @icmp_eq_and_i32(i32 %a, i1 %c) {
 ; CHECK-LABEL: icmp_eq_and_i32
-; CHECK: and [[REG:w[0-9]+]], w0, #0x4
+; CHECK: and [[REG:w[0-9]+]], w0, #0x3
 ; CHECK-NEXT: cbz [[REG]], {{LBB.+_3}}
+  %1 = and i32 %a, 3
+  br i1 %c, label %bb0, label %bb2
+bb0:
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that we do fold the 'and' instruction into the compare and
+; generate a tbz instruction for the conditional branch.
+define i32 @icmp_eq_and1bit_i32(i32 %a, i1 %c) {
+; CHECK-LABEL: icmp_eq_and1bit_i32
+; CHECK: tbz {{w[0-9]+}}, #2, {{LBB.+_3}}
   %1 = and i32 %a, 4
   br i1 %c, label %bb0, label %bb2
 bb0:
Index: test/CodeGen/X86/and-sink.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/and-sink.ll
@@ -0,0 +1,134 @@
+; RUN: llc -march=x86 -verify-machineinstrs < %s | FileCheck %s
+
+@A = global i32 zeroinitializer
+@B = global i32 zeroinitializer
+@C = global i32 zeroinitializer
+
+; Test that 'and' is sunk into bb0.
+define i32 @and_sink1(i32 %a, i1 %c) {
+; CHECK-LABEL: and_sink1:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl $4,
+; CHECK: movl $0, A
+; CHECK: testb $4,
+; CHECK: jne
+  %and = and i32 %a, 4
+  br i1 %c, label %bb0, label %bb2
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that both 'and' and cmp get sunk to bb1.
+define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
+; CHECK-LABEL: and_sink2:
+; CHECK: movl $0, A
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl $4,
+; CHECK: movl $0, B
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: movl $0, C
+; CHECK: testb $4,
+; CHECK: jne
+  %and = and i32 %a, 4
+  store i32 0, i32* @A
+  br i1 %c, label %bb0, label %bb3
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @B
+  br i1 %c2, label %bb1, label %bb3
+bb1:
+  store i32 0, i32* @C
+  br i1 %cmp, label %bb2, label %bb0
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
+
+; Test that CodeGenPrepare doesn't get stuck in a loop sinking and hoisting a masked load.
+define i32 @and_sink3(i1 %c, i32* %p) {
+; CHECK-LABEL: and_sink3:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: movzbl
+; CHECK: movl $0, A
+; CHECK: testl %
+; CHECK: je
+  %load = load i32, i32* %p
+  %and = and i32 %load, 255
+  br i1 %c, label %bb0, label %bb2
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that CodeGenPrepare sinks/duplicates non-immediate 'and'.
+define i32 @and_sink4(i32 %a, i32 %b, i1 %c) {
+; CHECK-LABEL: and_sink4:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl
+; CHECK: movl $0, A
+; CHECK: testl [[REG1:%[a-z0-9]+]], [[REG2:%[a-z0-9]+]]
+; CHECK: jne
+; CHECK: movl {{%[a-z0-9]+}}, B
+; CHECK: testl [[REG1]], [[REG2]]
+; CHECK: je
+  %and = and i32 %a, %b
+  %cmp = icmp eq i32 %and, 0
+  br i1 %c, label %bb0, label %bb3
+bb0:
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb3
+bb1:
+  %add = add i32 %a, %b
+  store i32 %add, i32* @B
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
+
+
+; Test that CodeGenPrepare doesn't sink/duplicate non-immediate 'and'
+; when it would increase register pressure.
+define i32 @and_sink5(i32 %a, i32 %b, i32 %a2, i32 %b2, i1 %c) {
+; CHECK-LABEL: and_sink5:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: andl {{[0-9]+\(%[a-z0-9]+\)}}, [[REG:%[a-z0-9]+]]
+; CHECK: movl $0, A
+; CHECK: jne
+; CHECK: movl {{%[a-z0-9]+}}, B
+; CHECK: testl [[REG]], [[REG]]
+; CHECK: je
+  %and = and i32 %a, %b
+  %cmp = icmp eq i32 %and, 0
+  br i1 %c, label %bb0, label %bb3
+bb0:
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb3
+bb1:
+  %add = add i32 %a2, %b2
+  store i32 %add, i32* @B
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
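
Illustration (not part of the patch): the LLVM IR sketch below shows the transformation sinkAndCmp0Expression performs. The function and value names (@sink_example, %and.sunk) are invented for the example and do not appear in the change. Before CodeGenPrepare runs, the 'and' sits in the entry block while its only user is an icmp-with-zero in another block; when the target's isMaskAndCmp0FoldingBeneficial hook accepts the mask, the 'and' is duplicated next to that icmp so that isel later sees the mask, compare, and branch together.

; Input (hypothetical): single-bit mask computed in the entry block, tested in %bb0.
define i32 @sink_example(i32 %a, i1 %c) {
entry:
  %and = and i32 %a, 4
  br i1 %c, label %bb0, label %bb1
bb0:
  %cmp = icmp eq i32 %and, 0
  br i1 %cmp, label %bb1, label %bb2
bb1:
  ret i32 0
bb2:
  ret i32 1
}

; After CodeGenPrepare (sketch, names illustrative): the 'and' is recreated in
; %bb0 and the original is erased, so the and/icmp/br pattern is local to one block:
;   bb0:
;     %and.sunk = and i32 %a, 4
;     %cmp = icmp eq i32 %and.sunk, 0
;     br i1 %cmp, label %bb1, label %bb2
; On AArch64 a power-of-two mask like this folds to tbz/tbnz; on X86, where the
; hook always returns true, the pattern becomes a test plus a conditional jump.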