Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -715,6 +715,10 @@
   /// immediate offset and no index register.
   bool LSRWithInstrQueries() const;
 
+  /// Return true if the loop strength reduce pass should try to group
+  /// similar unfolded offsets together to use only one register.
+  bool LSRShouldGroupUnfoldableOffsets() const;
+
   /// Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
@@ -1590,6 +1594,7 @@
                                      bool HasBaseReg, int64_t Scale,
                                      unsigned AddrSpace) = 0;
   virtual bool LSRWithInstrQueries() = 0;
+  virtual bool LSRShouldGroupUnfoldableOffsets() = 0;
   virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
   virtual bool isProfitableToHoist(Instruction *I) = 0;
   virtual bool useAA() = 0;
@@ -2026,6 +2031,9 @@
                                       AddrSpace);
   }
   bool LSRWithInstrQueries() override { return Impl.LSRWithInstrQueries(); }
+  bool LSRShouldGroupUnfoldableOffsets() override {
+    return Impl.LSRShouldGroupUnfoldableOffsets();
+  }
   bool isTruncateFree(Type *Ty1, Type *Ty2) override {
     return Impl.isTruncateFree(Ty1, Ty2);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -304,6 +304,8 @@
 
   bool LSRWithInstrQueries() const { return false; }
 
+  bool LSRShouldGroupUnfoldableOffsets() const { return false; }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) const { return false; }
 
   bool isProfitableToHoist(Instruction *I) const { return true; }
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -459,6 +459,10 @@
   return TTIImpl->LSRWithInstrQueries();
 }
 
+bool TargetTransformInfo::LSRShouldGroupUnfoldableOffsets() const {
+  return TTIImpl->LSRShouldGroupUnfoldableOffsets();
+}
+
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
   return TTIImpl->isTruncateFree(Ty1, Ty2);
 }
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -80,6 +80,7 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool prefersVectorizedAddressing() { return false; }
   bool LSRWithInstrQueries() { return true; }
+  bool LSRShouldGroupUnfoldableOffsets();
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
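[Note, not part of the patch: the motivation for the new hook is easiest to see with concrete numbers. On SystemZ the long-displacement address form D(X,B) takes a 20-bit signed displacement, so an offset outside [-524288, 524287] cannot be folded into the address and has to be materialized into a register. The standalone C++ sketch below illustrates this for the offsets used by the f10 test at the end of this patch; isFoldableDisplacement is a hypothetical stand-in for the check that isAlwaysFoldable() ends up performing for SystemZ address operands.]

  // Standalone illustration, not part of the patch.
  #include <cstdint>
  #include <cstdio>

  // Hypothetical stand-in for what isAlwaysFoldable() checks for a
  // SystemZ address operand: a 20-bit signed displacement.
  static bool isFoldableDisplacement(int64_t Off) {
    return Off >= -524288 && Off <= 524287;
  }

  int main() {
    // Byte offsets from the f10 test (element index * 8).
    const int64_t Offsets[] = {-199994 * 8, -199964 * 8, 0,
                               198013 * 8,  200035 * 8,  202011 * 8};
    for (int64_t Off : Offsets)
      std::printf("%10lld foldable: %s\n", (long long)Off,
                  isFoldableDisplacement(Off) ? "yes" : "no");
    // Only offset 0 is foldable; without grouping, each of the other
    // five offsets would get its own base register.
    return 0;
  }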
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -417,6 +417,12 @@
   return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
 }
 
+// EXPERIMENTAL
+static cl::opt<bool> LSRUNFOFFSETS("lsr-unfolded-offs", cl::init(true), cl::Hidden);
+bool SystemZTTIImpl::LSRShouldGroupUnfoldableOffsets() {
+  return LSRUNFOFFSETS;
+}
+
 InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1187,6 +1187,8 @@
   int64_t MinOffset = std::numeric_limits<int64_t>::max();
   int64_t MaxOffset = std::numeric_limits<int64_t>::min();
 
+  bool HasUnfoldedOffsets = false;
+
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop = true;
@@ -2001,13 +2003,18 @@
   void CollectInterestingTypesAndFactors();
   void CollectFixupsAndInitialFormulae();
+  void adjustInitialFormulaeForOffsets();
 
   // Support for sharing of LSRUses between LSRFixups.
   using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
   UseMapTy UseMap;
+  struct UseUnfOffsMapTy : std::multimap<const SCEV *, size_t> {};
+  UseUnfOffsMapTy UseUnfOffsMap;
 
   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
                           LSRUse::KindType Kind, MemAccessTy AccessTy);
+  bool reconcileUnfoldedAddressOffsets(LSRUse &LU, const SCEV *Expr,
+                                       int64_t &Offset, MemAccessTy AccessTy);
 
   std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
                                     MemAccessTy AccessTy);
@@ -2598,6 +2605,30 @@
   return true;
 }
 
+bool LSRInstance::reconcileUnfoldedAddressOffsets(LSRUse &LU, const SCEV *Expr,
+                                                  int64_t &Offset,
+                                                  MemAccessTy AccessTy) {
+  assert(LU.Formulae.size() == 1 && "Expected an initial formula.");
+  assert(!LU.Fixups.empty() && "Expected at least one fixup.");
+
+  Formula &F = LU.Formulae[0];
+  const SCEV *Reg = F.ScaledReg ? F.ScaledReg : F.BaseRegs[0];
+
+  const SCEV *Copy = Reg;
+  int64_t RegOffs = ExtractImmediate(Copy, SE);
+  if (RegOffs != 0 && Copy == Expr) {
+    int64_t NewOffset = Offset - RegOffs;
+    if (reconcileNewOffset(LU, NewOffset, /*HasBaseReg=*/true,
+                           LSRUse::Address, AccessTy)) {
+      Offset = NewOffset;
+      LU.HasUnfoldedOffsets = true;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 /// Return an LSRUse index and an offset value for a fixup which needs the given
 /// expression, with the given kind and optional access type. Either reuse an
 /// existing use or create a new one, as needed.
@@ -2608,8 +2639,25 @@
   int64_t Offset = ExtractImmediate(Expr, SE);
 
   // Basic uses can't accept any offset, for example.
+  UseUnfOffsMapTy::iterator ItrUnfolded = UseUnfOffsMap.end();
   if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
                         Offset, /*HasBaseReg=*/ true)) {
+    if (Kind == LSRUse::Address) {
+      if (TTI.LSRShouldGroupUnfoldableOffsets()) {
+        // Try to find a usable existing LSRUse with an unfoldable offset
+        // that is reconcilable with Offset.
+        auto R = UseUnfOffsMap.equal_range(Expr);
+        for (auto I = R.first; I != R.second; ++I) {
+          size_t LUIdx = I->second;
+          LSRUse &LU = Uses[LUIdx];
+          if (reconcileUnfoldedAddressOffsets(LU, Expr, Offset, AccessTy))
+            return std::make_pair(LUIdx, Offset);
+        }
+        // Remember that the bare Expr has an unfoldable offset and record
+        // the new LUIdx for it below.
+        ItrUnfolded = UseUnfOffsMap.insert(std::make_pair(Expr, SIZE_MAX));
+      }
+    }
     Expr = Copy;
     Offset = 0;
   }
@@ -2617,6 +2665,8 @@
   std::pair<UseMapTy::iterator, bool> P =
       UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
   if (!P.second) {
+    assert(ItrUnfolded == UseUnfOffsMap.end() &&
+           "Could not reconcile unfolded offsets with same Expr?");
     // A use already existed with this base.
     size_t LUIdx = P.first->second;
     LSRUse &LU = Uses[LUIdx];
@@ -2630,6 +2680,8 @@
   P.first->second = LUIdx;
   Uses.push_back(LSRUse(Kind, AccessTy));
   LSRUse &LU = Uses[LUIdx];
+  if (ItrUnfolded != UseUnfOffsMap.end())
+    ItrUnfolded->second = LUIdx;
 
   LU.MinOffset = Offset;
   LU.MaxOffset = Offset;
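[Note, not part of the patch: reconcileUnfoldedAddressOffsets() strips the immediate off an existing use's register with ExtractImmediate() and, when the remainder equals the incoming Expr, re-expresses the new fixup's offset relative to that register. The standalone sketch below replays that arithmetic with plain integers for the f10 offsets. The greedy first-fit base choice is an assumption made for illustration only; the base LSR actually ends up with can differ (the test's codegen uses +1600280 rather than +1584104).]

  // Standalone sketch, not part of the patch.
  #include <cstdint>
  #include <cstdio>
  #include <vector>

  static bool isFoldable(int64_t Off) { return Off >= -524288 && Off <= 524287; }

  struct Group {
    int64_t BaseOff;                // immediate folded into the base register
    std::vector<int64_t> Residuals; // per-fixup offsets relative to that base
  };

  int main() {
    // Unfoldable byte offsets from the f10 test.
    const int64_t Offsets[] = {-1599952, -1599712, 1584104, 1600280, 1616088};
    std::vector<Group> Groups;
    for (int64_t Off : Offsets) {
      bool Placed = false;
      for (Group &G : Groups) {
        int64_t Rel = Off - G.BaseOff; // mirrors Offset - RegOffs
        if (isFoldable(Rel)) {         // mirrors the reconcileNewOffset() check
          G.Residuals.push_back(Rel);
          Placed = true;
          break;
        }
      }
      if (!Placed) // no reconcilable group: this offset becomes a new base
        Groups.push_back({Off, {0}});
    }
    for (const Group &G : Groups) {
      std::printf("base %+lld residuals:", (long long)G.BaseOff);
      for (int64_t R : G.Residuals)
        std::printf(" %+lld", (long long)R);
      std::printf("\n");
    }
    // Prints two groups: base -1599952 {0, +240} and base +1584104
    // {0, +16176, +31984} -- two base registers instead of five.
    return 0;
  }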
@@ -3378,9 +3430,50 @@
     }
   }
 
+  if (TTI.LSRShouldGroupUnfoldableOffsets())
+    adjustInitialFormulaeForOffsets();
+
   LLVM_DEBUG(print_fixups(dbgs()));
 }
 
+void LSRInstance::adjustInitialFormulaeForOffsets() {
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    if (LU.HasUnfoldedOffsets) {
+      // The width of the offset range (LU.MaxOffset - LU.MinOffset) is known
+      // to be foldable, but the initial formula may still need to be
+      // adjusted. Adjust to eliminate negative offsets if needed:
+      if (LU.MinOffset < 0 &&
+          !isAlwaysFoldable(TTI, LU.Kind, LU.AccessTy, /*BaseGV=*/nullptr,
+                            LU.MinOffset, /*HasBaseReg=*/true)) {
+        int64_t OffsetAdjust = -LU.MinOffset;
+
+        // Add OffsetAdjust to the formula.
+        assert(LU.Formulae.size() == 1 && "Expected an initial formula.");
+        Formula &F = LU.Formulae[0];
+        const SCEV *Reg = F.ScaledReg ? F.ScaledReg : F.BaseRegs[0];
+        const SCEV *NewReg = SE.getAddExpr(
+            SE.getConstant(Reg->getType(), -OffsetAdjust, /*isSigned=*/true),
+            Reg);
+        if (F.ScaledReg == Reg)
+          F.ScaledReg = NewReg;
+        for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i)
+          if (F.BaseRegs[i] == Reg)
+            F.BaseRegs[i] = NewReg;
+
+        // Adjust offsets of all fixups.
+        for (LSRFixup &Fixup : LU.Fixups)
+          Fixup.Offset += OffsetAdjust;
+
+        LU.MinOffset = 0;
+        LU.MaxOffset += OffsetAdjust;
+        assert(isAlwaysFoldable(TTI, LU.Kind, LU.AccessTy, /*BaseGV=*/nullptr,
+                                LU.MaxOffset, /*HasBaseReg=*/true) &&
+               "The range of offsets should be foldable.");
+      }
+    }
+  }
+}
+
 /// Insert a formula for the given expression into the given use, separating out
 /// loop-variant portions from loop-invariant and loop-computable portions.
 void
@@ -4736,6 +4829,10 @@
       Formula &Best = LU.Formulae[P.first->second];
       if (IsBetterThan(F, Best))
        std::swap(F, Best);
+      if (TTI.LSRShouldGroupUnfoldableOffsets() && LU.Kind == LSRUse::Address &&
+          Best.UnfoldedOffset && !F.UnfoldedOffset)
+        continue; // Also keep the formula without the unfolded offset for evaluation.
+
       LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
                  dbgs() << "\n"
                            "    in favor of formula ";
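[Note, not part of the patch: a minimal sketch of the normalization done by adjustInitialFormulaeForOffsets(), with plain integers standing in for the SCEV register. Folding MinOffset into the register and adding -MinOffset to every fixup leaves each effective address Reg + Offset unchanged while making the smallest offset zero.]

  // Standalone sketch, not part of the patch; plain integers stand in
  // for the SCEV register expression.
  #include <cassert>
  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main() {
    int64_t Reg = 1600280; // base register value
    std::vector<int64_t> FixupOffsets = {-16176, 0, 15808};
    int64_t MinOffset = -16176;

    // Record the effective addresses before the adjustment.
    std::vector<int64_t> Before;
    for (int64_t Off : FixupOffsets)
      Before.push_back(Reg + Off);

    // Mirror the patch: NewReg = Reg + (-OffsetAdjust) ...
    int64_t OffsetAdjust = -MinOffset;
    Reg += -OffsetAdjust;
    // ... and every fixup offset moves the other way.
    for (int64_t &Off : FixupOffsets)
      Off += OffsetAdjust;

    // Each effective address Reg + Offset is unchanged, and the smallest
    // offset is now 0 (the range becomes [0, MaxOffset - MinOffset]).
    for (size_t I = 0; I != FixupOffsets.size(); ++I)
      assert(Reg + FixupOffsets[I] == Before[I]);
    std::printf("new offsets: %lld %lld %lld\n", (long long)FixupOffsets[0],
                (long long)FixupOffsets[1], (long long)FixupOffsets[2]);
    return 0;
  }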
Index: llvm/test/CodeGen/SystemZ/loop-01.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/loop-01.ll
+++ llvm/test/CodeGen/SystemZ/loop-01.ll
@@ -320,3 +320,43 @@
   %indvars.iv.next156.i.3 = add nsw i64 %indvars.iv155.i, 4
   br label %for.body.i63
 }
+
+; Test that there are two agfi instructions in the preheader but none in the loop body.
+define void @f10(double* %arg, double %V) {
+; CHECK-Z13-LABEL: f10:
+; CHECK-Z13: # %bb.0:
+; CHECK-Z13: lgr %r1, %r2
+; CHECK-Z13: lgr %r3, %r2
+; CHECK-Z13: agfi %r1, -1599952
+; CHECK-Z13: agfi %r3, 1600280
+; CHECK-Z13: lghi %r4, 0
+; CHECK-Z13-LABEL: .LBB9_1:
+; CHECK-Z13-NOT: agfi
+; CHECK-Z13: j .LBB9_1
+; CHECK-Z13-LABEL: .Lfunc_end9:
+
+bb:
+  br label %bb1
+
+bb1:
+  %i = phi i64 [ 0, %bb ], [ %i13, %bb1 ]
+  %i2 = getelementptr inbounds double, double* %arg, i64 %i
+  store volatile double %V, double* %i2, align 8
+  %i3 = add nsw i64 %i, -199994
+  %i4 = getelementptr inbounds double, double* %arg, i64 %i3
+  store volatile double %V, double* %i4, align 8
+  %i5 = add nuw nsw i64 %i, 202011
+  %i6 = getelementptr inbounds double, double* %arg, i64 %i5
+  store volatile double %V, double* %i6, align 8
+  %i7 = add nuw nsw i64 %i, 198013
+  %i8 = getelementptr inbounds double, double* %arg, i64 %i7
+  store volatile double %V, double* %i8, align 8
+  %i9 = add nuw nsw i64 %i, 200035
+  %i10 = getelementptr inbounds double, double* %arg, i64 %i9
+  store volatile double %V, double* %i10, align 8
+  %i11 = add nsw i64 %i, -199964
+  %i12 = getelementptr inbounds double, double* %arg, i64 %i11
+  store volatile double %V, double* %i12, align 8
+  %i13 = add nuw nsw i64 %i, 20
+  br label %bb1
+}
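[Note, not part of the patch: a standalone sanity check that the agfi immediates expected above are exactly two of the element offsets scaled by 8 bytes, and that every remaining displacement fits the 20-bit signed range, which is why no agfi is needed inside the loop.]

  // Standalone sanity check, not part of the patch.
  #include <cassert>
  #include <cstdint>

  int main() {
    // The two preheader agfi immediates are element offsets * 8 bytes.
    assert(-199994 * 8 == -1599952); // first agfi
    assert(200035 * 8 == 1600280);   // second agfi
    // Displacements of the remaining accesses relative to the two
    // adjusted bases (the offset-0 store uses %arg, i.e. %r2, directly):
    const int64_t Rel[] = {
        -199964 * 8 - (-1599952), // +240   from the -1599952 base
        198013 * 8 - 1600280,     // -16176 from the +1600280 base
        202011 * 8 - 1600280,     // +15808 from the +1600280 base
    };
    for (int64_t R : Rel)
      assert(R >= -524288 && R <= 524287); // foldable, so no agfi in the loop
    return 0;
  }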