Index: llvm/lib/Transforms/Scalar/IndVarSimplify.cpp =================================================================== --- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1024,6 +1024,7 @@ Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); bool widenLoopCompare(NarrowIVDefUse DU); + bool widenWithVariantLoadUse(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; @@ -1368,6 +1369,128 @@ return true; } +/// If the narrow use is an instruction whose two operands are the defining +/// instruction of DU and a load instruction, then we have the following: +/// if the load is hoisted outside the loop, then we do not reach this function +/// as scalar evolution analysis works fine in widenIVUse with variables +/// hoisted outside the loop and efficient code is subsequently generated by +/// not emitting truncate instructions. But when the load is not hoisted +/// (whether due to limitation in alias analysis or due to a true legality), +/// then scalar evolution can not proceed with loop variant values and +/// inefficient code is generated. This function handles the non-hoisted load +/// special case by making the optimization generate the same type of code for +/// hoisted and non-hoisted load (widen use and eliminate sign extend +/// instruction). This special case is important especially when the induction +/// variables are affecting addressing mode in code generation. +bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + // Handle the common case of add + const unsigned OpCode = NarrowUse->getOpcode(); + // Only Add/Sub/Mul instructions are supported. + if (OpCode != Instruction::Add && OpCode != Instruction::Sub && + OpCode != Instruction::Mul) + return false; + + // The operand that is not defined by NarrowDef of DU. 
Let's call it the
+  // other operand.
+  unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == NarrowDef ? 1 : 0;
+  assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+
+  const SCEV *ExtendOperExpr = nullptr;
+  const OverflowingBinaryOperator *OBO =
+    cast<OverflowingBinaryOperator>(NarrowUse);
+  ExtendKind ExtKind = getExtendKind(NarrowDef);
+  if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
+    ExtendOperExpr = SE->getSignExtendExpr(
+      SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else if (ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
+    ExtendOperExpr = SE->getZeroExtendExpr(
+      SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else
+    return false;
+
+  // We are interested in the other operand being a load instruction.
+  // But, we should look into relaxing this restriction later on.
+  auto *I = dyn_cast<Instruction>(NarrowUse->getOperand(ExtendOperIdx));
+  if (I && I->getOpcode() != Instruction::Load)
+    return false;
+
+  // Verifying that Defining operand is an AddRec
+  const SCEV *op1 = SE->getSCEV(WideDef);
+  const SCEVAddRecExpr *AddRecOp1 = dyn_cast<SCEVAddRecExpr>(op1);
+  if (!AddRecOp1 || AddRecOp1->getLoop() != L)
+    return false;
+  // Verifying that other operand is an Extend.
+  if (ExtKind == SignExtended)
+  {
+    const SCEVSignExtendExpr *AddRecOp2 = dyn_cast<SCEVSignExtendExpr>(ExtendOperExpr);
+    if (!AddRecOp2)
+      return false;
+  }
+  else
+  {
+    const SCEVZeroExtendExpr *AddRecOp2 = dyn_cast<SCEVZeroExtendExpr>(ExtendOperExpr);
+    if (!AddRecOp2)
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+
+  // Generating a widening use instruction.
+  Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+                   ? WideDef
+                   : createExtendInst(NarrowUse->getOperand(0), WideType,
+                                      ExtKind, NarrowUse);
+  Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+                   ? 
WideDef
+                   : createExtendInst(NarrowUse->getOperand(1), WideType,
+                                      ExtKind, NarrowUse);
+
+  auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+                                        NarrowBO->getName());
+  IRBuilder<> Builder(NarrowUse);
+  Builder.Insert(WideBO);
+  WideBO->copyIRFlags(NarrowBO);
+
+  if (ExtKind == SignExtended)
+    ExtendKindMap[NarrowUse] = SignExtended;
+  else
+    ExtendKindMap[NarrowUse] = ZeroExtended;
+
+  // Check if widening the use eliminates sign/zero extend instructions.
+  bool WideningUseful = false;
+  for (Use &U : NarrowUse->uses()) {
+    auto *User = cast<Instruction>(U.getUser());
+
+    if (((isa<SExtInst>(User) && (getExtendKind(NarrowUse) == SignExtended)) ||
+         (isa<ZExtInst>(User) && (getExtendKind(NarrowUse) == ZeroExtended))) &&
+        (User->getType() == WideType)) {
+      LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User
+                 << " replaced by " << *WideBO << "\n");
+      ++NumElimExt;
+      User->replaceAllUsesWith(WideBO);
+      DeadInsts.emplace_back(User);
+      WideningUseful = true;
+    }
+    else {
+      WideningUseful = false;
+      break;
+    }
+  }
+
+  // In case the widened instruction is not consumed by s/z extend instruction
+  // and is hence not useful, we throw it away.
+  if (!WideningUseful) {
+    LLVM_DEBUG(dbgs() << "Widening use not useful: " << *WideBO << "\n");
+    DeadInsts.emplace_back(WideBO);
+  }
+  return WideningUseful;
+}
+
 /// Determine whether an individual user of the narrow IV can be widened. If so,
 /// return the wide clone of the user.
 Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
@@ -1465,6 +1588,14 @@
   if (widenLoopCompare(DU))
     return nullptr;
 
+  // We are here about to generate a truncate instruction that may hurt
+  // performance because the scalar evolution expression computed earlier
+  // in WideAddRec.first does not indicate a polynomial induction expression.
+  // In that case, look at the operands of the use instruction to determine
+  // if we can still widen the use instead of truncating its operand. 
+ if (widenWithVariantLoadUse(DU)) + return nullptr; + // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. Index: llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll =================================================================== --- llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll +++ llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll @@ -273,3 +273,87 @@ %call = call i32 @dummy(i32* getelementptr inbounds ([100 x i32], [100 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([100 x i32], [100 x i32]* @b, i32 0, i32 0)) ret i32 0 } + +%struct.image = type {i32, i32} +define i32 @foo4(%struct.image* %input, i32 %length, i32* %in) { +entry: + %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1 + %0 = load i32, i32* %stride, align 4 + %cmp17 = icmp sgt i32 %length, 1 + br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %1 = phi i32 [ %6, %for.body ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ] + ret i32 %2 + +; mul instruction below is widened instead of generating a truncate instruction for it +; regardless if Load operand of mul is inside or outside the loop (we have both cases). 
+; CHECK: for.body:
+; CHECK-NOT: trunc
+for.body: ; preds = %for.body.lr.ph, %for.body
+  %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ]
+  %add = add nuw nsw i32 %x.018, 1
+  %3 = load i32, i32* %channel, align 8
+  %mul = mul nsw i32 %3, %add
+  %idx.ext = sext i32 %mul to i64
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext
+  %4 = load i32, i32* %add.ptr, align 4
+  %mul1 = mul nsw i32 %0, %add
+  %idx.ext1 = sext i32 %mul1 to i64
+  %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1
+  %5 = load i32, i32* %add.ptr1, align 4
+  %6 = add i32 %4, %5
+  %cmp = icmp slt i32 %add, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+
+define i32 @foo5(%struct.image* %input, i32 %length, i32* %in) {
+entry:
+  %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1
+  %0 = load i32, i32* %stride, align 4
+  %cmp17 = icmp sgt i32 %length, 1
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+  %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0
+  br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+  %1 = phi i32 [ %7, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+  %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ]
+  ret i32 %2
+
+; This example is the same as above except that the first mul is used in two places
+; and this may result in having two versions of the multiply: an i32 and i64 version.
+; In this case, keep the truncate instructions to avoid this redundancy. 
+; CHECK: for.body: +; CHECK: trunc +for.body: ; preds = %for.body.lr.ph, %for.body + %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ] + %add = add nuw nsw i32 %x.018, 1 + %3 = load i32, i32* %channel, align 8 + %mul = mul nsw i32 %3, %add + %idx.ext = sext i32 %mul to i64 + %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext + %4 = load i32, i32* %add.ptr, align 4 + %mul1 = mul nsw i32 %0, %add + %idx.ext1 = sext i32 %mul1 to i64 + %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1 + %5 = load i32, i32* %add.ptr1, align 4 + %6 = add i32 %4, %5 + %7 = add i32 %6, %mul + %cmp = icmp slt i32 %add, %length + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +}