Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
@@ -64,6 +64,9 @@
                           "computations were sunk");
 STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
 STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumAndsAdded,
+          "Number of and mask instructions added to form ext loads");
+STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
 STATISTIC(NumRetsDup, "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
@@ -173,6 +176,7 @@
     bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
     bool moveExtToFormExtLoad(Instruction *&I);
     bool optimizeExtUses(Instruction *I);
+    bool optimizeLoadExt(LoadInst *I);
     bool optimizeSelectInst(SelectInst *SI);
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
     bool optimizeSwitchInst(SwitchInst *CI);
@@ -4256,6 +4260,189 @@
   return MadeChange;
 }
 
+// Find loads whose uses only use some of the loaded value's bits. Add an
+// "and" just after the load if the target can fold this into one extload
+// instruction, with the hope of eliminating some of the other later "and"
+// instructions using the loaded value. "and"s that are made trivially
+// redundant by the insertion of the new "and" are removed by this function,
+// while others (e.g. those whose path from the load goes through a phi) are
+// left for isel to potentially remove.
+//
+// For example:
+//
+// b0:
+//   x = load i32
+//   ...
+// b1:
+//   y = and x, 0xff
+//   z = use y
+//
+// becomes:
+//
+// b0:
+//   x = load i32
+//   x' = and x, 0xff
+//   ...
+// b1:
+//   z = use x'
+//
+// whereas:
+//
+// b0:
+//   x1 = load i32
+//   ...
+// b1:
+//   x2 = load i32
+//   ...
+// b2:
+//   x = phi x1, x2
+//   y = and x, 0xff
+//
+// becomes (after a call to optimizeLoadExt for each load):
+//
+// b0:
+//   x1 = load i32
+//   x1' = and x1, 0xff
+//   ...
+// b1:
+//   x2 = load i32
+//   x2' = and x2, 0xff
+//   ...
+// b2:
+//   x = phi x1', x2'
+//   y = and x, 0xff
+//
+
+bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
+
+  if (!Load->isSimple() ||
+      !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
+    return false;
+
+  // Skip loads we've already transformed or have no reason to transform.
+  if (Load->hasOneUse()) {
+    User *LoadUser = *Load->user_begin();
+    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
+        !dyn_cast<PHINode>(LoadUser))
+      return false;
+  }
+
+  // Look at all uses of Load, looking through phis, to determine how many
+  // bits of the loaded value are needed.
+  SmallVector<Instruction *, 8> WorkList;
+  SmallPtrSet<Instruction *, 16> Visited;
+  SmallVector<Instruction *, 8> AndsToMaybeRemove;
+  for (auto *U : Load->users())
+    WorkList.push_back(cast<Instruction>(U));
+
+  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
+  unsigned BitWidth = LoadResultVT.getSizeInBits();
+  APInt DemandBits(BitWidth, 0);
+  APInt WidestAndBits(BitWidth, 0);
+
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    // Break use-def graph loops.
+    if (!Visited.insert(I).second)
+      continue;
+
+    // For a PHI node, push all of its users.
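+    // (A phi merges the loaded value unchanged, so the bits demanded from
+    // the load are exactly the bits demanded by the phi's transitive users.)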
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      for (auto *U : Phi->users())
+        WorkList.push_back(cast<Instruction>(U));
+      continue;
+    }
+
+    switch (I->getOpcode()) {
+    case llvm::Instruction::And: {
+      auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!AndC)
+        return false;
+      APInt AndBits = AndC->getValue();
+      DemandBits |= AndBits;
+      // Keep track of the widest and mask we see.
+      if (AndBits.ugt(WidestAndBits))
+        WidestAndBits = AndBits;
+      if (AndBits == WidestAndBits && I->getOperand(0) == Load)
+        AndsToMaybeRemove.push_back(I);
+      break;
+    }
+
+    case llvm::Instruction::Shl: {
+      auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!ShlC)
+        return false;
+      uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
+      auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
+      DemandBits |= ShlDemandBits;
+      break;
+    }
+
+    case llvm::Instruction::Trunc: {
+      EVT TruncVT = TLI->getValueType(*DL, I->getType());
+      unsigned TruncBitWidth = TruncVT.getSizeInBits();
+      auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
+      DemandBits |= TruncBits;
+      break;
+    }
+
+    default:
+      return false;
+    }
+  }
+
+  uint32_t ActiveBits = DemandBits.getActiveBits();
+  // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
+  // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
+  // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true,
+  // but (and (load x) 1) is not matched as a single instruction, rather as a
+  // LDR followed by an AND.
+  // TODO: Look into removing this restriction by fixing backends to either
+  // return false for isLoadExtLegal for i1 or have them select this pattern
+  // to a single instruction.
+  //
+  // Also avoid hoisting if we didn't see any ands with the exact DemandBits
+  // mask, since these are the only ands that will be removed by isel.
+  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+      WidestAndBits != DemandBits)
+    return false;
+
+  LLVMContext &Ctx = Load->getType()->getContext();
+  Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
+  EVT TruncVT = TLI->getValueType(*DL, TruncTy);
+
+  // Reject cases that won't be matched as extloads.
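+  // (bitsGT ensures the truncated type is strictly narrower than the loaded
+  // type, so this really is an extending load; isRound rejects widths such
+  // as i24 that have no power-of-two-byte memory type; isLoadExtLegal asks
+  // the target whether the zero-extension can be folded into the load.)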
+  if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
+      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
+    return false;
+
+  IRBuilder<> Builder(Load->getNextNode());
+  auto *NewAnd = dyn_cast<Instruction>(
+      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+
+  // Replace all uses of load with new and (except for the use of load in the
+  // new and itself).
+  Load->replaceAllUsesWith(NewAnd);
+  NewAnd->setOperand(0, Load);
+
+  // Remove any and instructions that are now redundant.
+  for (auto *And : AndsToMaybeRemove)
+    // Check that the and mask is the same as the one we decided to put on
+    // the new and.
+    if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
+      And->replaceAllUsesWith(NewAnd);
+      if (&*CurInstIterator == And)
+        CurInstIterator = std::next(And->getIterator());
+      And->eraseFromParent();
+      ++NumAndUses;
+    }
+
+  ++NumAndsAdded;
+  return true;
+}
+
 /// Check if V (an operand of a select instruction) is an expensive
 /// instruction that is only used once.
 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -4957,8 +5144,10 @@
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     stripInvariantGroupMetadata(*LI);
     if (TLI) {
+      bool Modified = optimizeLoadExt(LI);
       unsigned AS = LI->getPointerAddressSpace();
-      return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
     }
     return false;
   }
Index: llvm/trunk/test/CodeGen/AArch64/free-zext.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/free-zext.ll
+++ llvm/trunk/test/CodeGen/AArch64/free-zext.ll
@@ -26,3 +26,46 @@
   store i64 %load64, i64* %dst2, align 8
   ret void
 }
+
+; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads
+; feeding a phi whose value is masked with an and, allowing each load
+; to be formed as a zext load.
+define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) {
+; CHECK-LABEL: test_free_zext3:
+bb1:
+; CHECK: ldrh [[REG:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+  %tmp1 = load i32, i32* %ptr, align 4
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+; CHECK: ldrh [[REG2:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff
+  %tmp2 = load i32, i32* %ptr2, align 4
+  br label %bb3
+bb3:
+  %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+  %tmpand = and i32 %tmp3, 65535
+  ret i32 %tmpand
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): check the case of a zext-able
+; load feeding a phi in the same block.
+define void @test_free_zext4(i32* %ptr, i32* %ptr2, i32* %dst) {
+; CHECK-LABEL: test_free_zext4:
+; CHECK: ldrh [[REG:w[0-9]+]]
+; TODO: fix isel to remove the final and; until then this check is disabled:
+; XCHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+; CHECK: ldrh [[REG:w[0-9]+]]
+bb1:
+  %load1 = load i32, i32* %ptr, align 4
+  br label %loop
+loop:
+  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
+  %and = and i32 %phi, 65535
+  store i32 %and, i32* %dst, align 4
+  %load2 = load i32, i32* %ptr2, align 4
+  %cmp = icmp ne i32 %and, 0
+  br i1 %cmp, label %loop, label %end
+end:
+  ret void
+}
Index: llvm/trunk/test/Transforms/CodeGenPrepare/free-zext.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/free-zext.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/free-zext.ll
@@ -0,0 +1,82 @@
+; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck %s
+
+; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads
+; feeding a phi whose value is masked with an and, allowing each load
+; to be formed as a zext load.
+define i32 @test_free_zext(i32* %ptr, i32* %ptr2, i32 %c) {
+; CHECK-LABEL: @test_free_zext(
+bb1:
+; CHECK-LABEL: bb1:
+; CHECK: %[[T1:.*]] = load
+; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
+  %load1 = load i32, i32* %ptr, align 4
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+; CHECK-LABEL: bb2:
+; CHECK: %[[T2:.*]] = load
+; CHECK: %[[A2:.*]] = and i32 %[[T2]], 65535
+  %load2 = load i32, i32* %ptr2, align 4
+  br label %bb3
+bb3:
+; CHECK-LABEL: bb3:
+; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2]], %bb2 ]
+  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %bb2 ]
+  %and = and i32 %phi, 65535
+  ret i32 %and
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): exercise all opcode
+; cases of the active bit calculation.
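+; (An "and" with a constant demands exactly its mask bits, a shl by N
+; demands the low BitWidth - N bits, and a trunc to iN demands the low N
+; bits; the union must form a contiguous low mask for the transform to fire.)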
+define i32 @test_free_zext2(i32* %ptr, i16* %dst16, i32* %dst32, i32 %c) {
+; CHECK-LABEL: @test_free_zext2(
+bb1:
+; CHECK-LABEL: bb1:
+; CHECK: %[[T1:.*]] = load
+; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
+  %load1 = load i32, i32* %ptr, align 4
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %bb2, label %bb4
+bb2:
+; CHECK-LABEL: bb2:
+  %trunc = trunc i32 %load1 to i16
+  store i16 %trunc, i16* %dst16, align 2
+  br i1 %cmp, label %bb3, label %bb4
+bb3:
+; CHECK-LABEL: bb3:
+  %shl = shl i32 %load1, 16
+  store i32 %shl, i32* %dst32, align 4
+  br label %bb4
+bb4:
+; CHECK-LABEL: bb4:
+; CHECK-NOT: and
+; CHECK: ret i32 %[[A1]]
+  %and = and i32 %load1, 65535
+  ret i32 %and
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): check the case of a zext-able
+; load feeding a phi in the same block.
+define void @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i64* %c) {
+; CHECK-LABEL: @test_free_zext3(
+bb1:
+; CHECK-LABEL: bb1:
+; CHECK: %[[T1:.*]] = load
+; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
+  %load1 = load i32, i32* %ptr, align 4
+  br label %loop
+loop:
+; CHECK-LABEL: loop:
+; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2]], %loop ]
+  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
+  %and = and i32 %phi, 65535
+  store i32 %and, i32* %dst, align 4
+  %idx = load volatile i64, i64* %c, align 4
+  %addr = getelementptr inbounds i32, i32* %ptr2, i64 %idx
+; CHECK: %[[T2:.*]] = load i32
+; CHECK: %[[A2:.*]] = and i32 %[[T2]], 65535
+  %load2 = load i32, i32* %addr, align 4
+  %cmp = icmp ne i64 %idx, 0
+  br i1 %cmp, label %loop, label %end
+end:
+  ret void
+}