Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -64,6 +64,7 @@
                           "computations were sunk");
 STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
 STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumAndsMoved, "Number of and mask instructions combined with loads");
 STATISTIC(NumRetsDup, "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
@@ -173,6 +174,7 @@
     bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
     bool moveExtToFormExtLoad(Instruction *&I);
     bool optimizeExtUses(Instruction *I);
+    bool optimizeLoadExt(LoadInst *I);
    bool optimizeSelectInst(SelectInst *SI);
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
     bool optimizeSwitchInst(SwitchInst *CI);
@@ -4172,6 +4174,132 @@
   return MadeChange;
 }
 
+// Find loads whose uses only use some of the loaded value's bits.  Add an and
+// just after the load if the target can fold this into one extload
+// instruction, with the hope of eliminating some of the other later and
+// instructions using the loaded value.
+//
+// For example:
+//
+// b0:
+//   x = load i32
+//   ...
+// b1:
+//   y = and x, 0xff
+//
+// becomes:
+//
+// b0:
+//   x = load i32
+//   x' = and x, 0xff
+//   ...
+// b1:
+//   y = and x', 0xff
+//
+// with the assumption that isel will remove the and defining y, and that the
+// load of x together with the and defining x' will be selected as a single
+// extload instruction.
+bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
+  if (!Load->isSimple() ||
+      !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
+    return false;
+
+  // Skip loads we've already transformed or have no reason to transform.
+  if (Load->hasOneUse()) {
+    User *LoadUser = *Load->user_begin();
+    if (!isa<Instruction>(LoadUser) ||
+        (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
+         !isa<PHINode>(LoadUser)))
+      return false;
+  }
+
+  // Look at all uses of Load, looking through phis, to determine how many bits
+  // of the loaded value are needed.
+  SmallVector<Value *, 8> WorkList;
+  SmallPtrSet<Value *, 16> Visited;
+  for (auto *U : Load->users())
+    WorkList.push_back(U);
+
+  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
+  unsigned BitWidth = LoadResultVT.getSizeInBits();
+  APInt DemandBits(BitWidth, 0);
+  APInt WidestAndBits(BitWidth, 0);
+
+  while (!WorkList.empty()) {
+    Value *V = WorkList.back();
+    WorkList.pop_back();
+
+    // Break use-def graph loops.
+    if (!Visited.insert(V).second)
+      continue;
+
+    // For a PHI node, push all of its users.
+    if (auto *Phi = dyn_cast<PHINode>(V)) {
+      for (auto *U : Phi->users())
+        WorkList.push_back(U);
+      continue;
+    }
+
+    if (auto *BinOp = dyn_cast<BinaryOperator>(V)) {
+      if (BinOp->getOpcode() == Instruction::And) {
+        if (auto *AndC = dyn_cast<ConstantInt>(BinOp->getOperand(1))) {
+          APInt AndBits = AndC->getValue();
+          DemandBits |= AndBits;
+          // Keep track of the widest and mask we see.
+          if (AndBits.ugt(WidestAndBits))
+            WidestAndBits = AndBits;
+          continue;
+        }
+      } else if (BinOp->getOpcode() == Instruction::Shl) {
+        if (auto *ShlC = dyn_cast<ConstantInt>(BinOp->getOperand(1))) {
+          uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
+          // A shift left only demands the low bits that are not shifted out.
+          auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
+          DemandBits |= ShlDemandBits;
+          continue;
+        }
+      }
+    } else if (auto *Trunc = dyn_cast<TruncInst>(V)) {
+      EVT TruncVT = TLI->getValueType(*DL, Trunc->getType());
+      unsigned TruncBitWidth = TruncVT.getSizeInBits();
+      auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
+      DemandBits |= TruncBits;
+      continue;
+    }
+
+    // Any other use demands all of the loaded value's bits.
+    return false;
+  }
+
+  uint32_t ActiveBits = DemandBits.getActiveBits();
+  // Avoid hoisting an 'and' with a 0x1 mask, since it is unlikely to be folded
+  // by the target even if isLoadExtLegal says an i1 EXTLOAD is valid.  Also
+  // avoid hoisting if we didn't see any ands with the exact DemandBits mask,
+  // since these are the only ands that will be removed by isel.
+  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+      WidestAndBits != DemandBits)
+    return false;
+
+  LLVMContext &Ctx = Load->getType()->getContext();
+  Type *ExtTy = Type::getIntNTy(Ctx, ActiveBits);
+  EVT ExtVT = TLI->getValueType(*DL, ExtTy);
+
+  // Reject cases that won't be matched as extloads.
+  if (!LoadResultVT.bitsGT(ExtVT) || !ExtVT.isRound() ||
+      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, ExtVT))
+    return false;
+
+  IRBuilder<> Builder(Load->getNextNode());
+  auto *NewAnd = cast<Instruction>(
+      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+
+  // Replace all uses of the load with the new and (except for the use of the
+  // load in the new and itself).
+  Load->replaceAllUsesWith(NewAnd);
+  NewAnd->setOperand(0, Load);
+
+  ++NumAndsMoved;
+  return true;
+}
+
 /// Check if V (an operand of a select instruction) is an expensive instruction
 /// that is only used once.
 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -4873,8 +5001,14 @@
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     stripInvariantGroupMetadata(*LI);
     if (TLI) {
+      bool Modified = false;
+
+      if (TLI->enableExtLdPromotion() && !DisableExtLdPromotion)
+        Modified |= optimizeLoadExt(LI);
+
       unsigned AS = LI->getPointerAddressSpace();
-      return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
     }
     return false;
   }
Index: test/CodeGen/AArch64/free-zext.ll
===================================================================
--- test/CodeGen/AArch64/free-zext.ll
+++ test/CodeGen/AArch64/free-zext.ll
@@ -26,3 +26,23 @@
   store i64 %load64, i64* %dst2, align 8
   ret void
 }
+
+define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) {
+; CHECK-LABEL: test_free_zext3:
+bb1:
+; CHECK: ldrh [[REG:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+  %tmp1 = load i32, i32* %ptr, align 4
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+; CHECK: ldrh [[REG2:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff
+  %tmp2 = load i32, i32* %ptr2, align 4
+  br label %bb3
+bb3:
+  %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+  %tmpand = and i32 %tmp3, 65535
+  ret i32 %tmpand
+}
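
Note for reviewers: the sketch below shows, at the IR level, roughly what
CodeGenPrepare produces for test_free_zext3 with this patch applied. It is
hand-written for illustration, not actual pass output (the %masked1/%masked2
names are made up; the pass emits unnamed values):

  bb1:
    %tmp1 = load i32, i32* %ptr, align 4
    %masked1 = and i32 %tmp1, 65535       ; inserted by optimizeLoadExt
    %cmp = icmp ne i32 %c, 0
    br i1 %cmp, label %bb2, label %bb3
  bb2:
    %tmp2 = load i32, i32* %ptr2, align 4
    %masked2 = and i32 %tmp2, 65535       ; inserted by optimizeLoadExt
    br label %bb3
  bb3:
    %tmp3 = phi i32 [ %masked1, %bb1 ], [ %masked2, %bb2 ]
    %tmpand = and i32 %tmp3, 65535        ; now redundant; folded during isel
    ret i32 %tmpand

Each load/and pair is then matched as a single ldrh (an i16 zextload), and the
trailing and in bb3 is eliminated during instruction selection because the
high bits of both incoming values are already known to be zero, which is what
the CHECK-NOT lines in the test verify.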