Index: llvm/trunk/lib/Transforms/Scalar/IndVarSimplify.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/IndVarSimplify.cpp +++ llvm/trunk/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1017,6 +1017,8 @@ Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); bool widenLoopCompare(NarrowIVDefUse DU); + bool widenWithVariantLoadUse(NarrowIVDefUse DU); + void widenWithVariantLoadUseCodegen(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; @@ -1361,6 +1363,146 @@ return true; } +/// If the narrow use is an instruction whose two operands are the defining +/// instruction of DU and a load instruction, then we have the following: +/// if the load is hoisted outside the loop, then we do not reach this function +/// as scalar evolution analysis works fine in widenIVUse with variables +/// hoisted outside the loop and efficient code is subsequently generated by +/// not emitting truncate instructions. But when the load is not hoisted +/// (whether due to limitation in alias analysis or due to a true legality), +/// then scalar evolution can not proceed with loop variant values and +/// inefficient code is generated. This function handles the non-hoisted load +/// special case by making the optimization generate the same type of code for +/// hoisted and non-hoisted load (widen use and eliminate sign extend +/// instruction). This special case is important especially when the induction +/// variables are affecting addressing mode in code generation. +bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + // Handle the common case of add + const unsigned OpCode = NarrowUse->getOpcode(); + // Only Add/Sub/Mul instructions are supported. + if (OpCode != Instruction::Add && OpCode != Instruction::Sub && + OpCode != Instruction::Mul) + return false; + + // The operand that is not defined by NarrowDef of DU. Let's call it the + // other operand. + unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == NarrowDef ? 1 : 0; + assert(DU.NarrowUse->getOperand(1 - ExtendOperIdx) == DU.NarrowDef && + "bad DU"); + + const SCEV *ExtendOperExpr = nullptr; + const OverflowingBinaryOperator *OBO = + cast(NarrowUse); + ExtendKind ExtKind = getExtendKind(NarrowDef); + if (ExtKind == SignExtended && OBO->hasNoSignedWrap()) + ExtendOperExpr = SE->getSignExtendExpr( + SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType); + else if (ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap()) + ExtendOperExpr = SE->getZeroExtendExpr( + SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType); + else + return false; + + // We are interested in the other operand being a load instruction. + // But, we should look into relaxing this restriction later on. + auto *I = dyn_cast(NarrowUse->getOperand(ExtendOperIdx)); + if (I && I->getOpcode() != Instruction::Load) + return false; + + // Verifying that Defining operand is an AddRec + const SCEV *Op1 = SE->getSCEV(WideDef); + const SCEVAddRecExpr *AddRecOp1 = dyn_cast(Op1); + if (!AddRecOp1 || AddRecOp1->getLoop() != L) + return false; + // Verifying that other operand is an Extend. + if (ExtKind == SignExtended) { + if (!isa(ExtendOperExpr)) + return false; + } else { + if (!isa(ExtendOperExpr)) + return false; + } + + if (ExtKind == SignExtended) { + for (Use &U : NarrowUse->uses()) { + SExtInst *User = dyn_cast(U.getUser()); + if (!User || User->getType() != WideType) + return false; + } + } else { // ExtKind == ZeroExtended + for (Use &U : NarrowUse->uses()) { + ZExtInst *User = dyn_cast(U.getUser()); + if (!User || User->getType() != WideType) + return false; + } + } + + return true; +} + +/// Special Case for widening with variant Loads (see +/// WidenIV::widenWithVariantLoadUse). This is the code generation part. +void WidenIV::widenWithVariantLoadUseCodegen(NarrowIVDefUse DU) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + ExtendKind ExtKind = getExtendKind(NarrowDef); + + LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); + + // Generating a widening use instruction. + Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + ExtKind, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + ExtKind, NarrowUse); + + auto *NarrowBO = cast(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + + if (ExtKind == SignExtended) + ExtendKindMap[NarrowUse] = SignExtended; + else + ExtendKindMap[NarrowUse] = ZeroExtended; + + // Update the Use. + if (ExtKind == SignExtended) { + for (Use &U : NarrowUse->uses()) { + SExtInst *User = dyn_cast(U.getUser()); + if (User && User->getType() == WideType) { + LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User << " replaced by " + << *WideBO << "\n"); + ++NumElimExt; + User->replaceAllUsesWith(WideBO); + DeadInsts.emplace_back(User); + } + } + } else { // ExtKind == ZeroExtended + for (Use &U : NarrowUse->uses()) { + ZExtInst *User = dyn_cast(U.getUser()); + if (User && User->getType() == WideType) { + LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User << " replaced by " + << *WideBO << "\n"); + ++NumElimExt; + User->replaceAllUsesWith(WideBO); + DeadInsts.emplace_back(User); + } + } + } +} + /// Determine whether an individual user of the narrow IV can be widened. If so, /// return the wide clone of the user. Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { @@ -1458,6 +1600,16 @@ if (widenLoopCompare(DU)) return nullptr; + // We are here about to generate a truncate instruction that may hurt + // performance because the scalar evolution expression computed earlier + // in WideAddRec.first does not indicate a polynomial induction expression. + // In that case, look at the operands of the use instruction to determine + // if we can still widen the use instead of truncating its operand. + if (widenWithVariantLoadUse(DU)) { + widenWithVariantLoadUseCodegen(DU); + return nullptr; + } + // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. Index: llvm/trunk/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll =================================================================== --- llvm/trunk/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll +++ llvm/trunk/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll @@ -273,3 +273,87 @@ %call = call i32 @dummy(i32* getelementptr inbounds ([100 x i32], [100 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([100 x i32], [100 x i32]* @b, i32 0, i32 0)) ret i32 0 } + +%struct.image = type {i32, i32} +define i32 @foo4(%struct.image* %input, i32 %length, i32* %in) { +entry: + %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1 + %0 = load i32, i32* %stride, align 4 + %cmp17 = icmp sgt i32 %length, 1 + br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %1 = phi i32 [ %6, %for.body ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ] + ret i32 %2 + +; mul instruction below is widened instead of generating a truncate instruction for it +; regardless if Load operand of mul is inside or outside the loop (we have both cases). +; CHECK: for.body: +; CHECK-NOT: trunc +for.body: ; preds = %for.body.lr.ph, %for.body + %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ] + %add = add nuw nsw i32 %x.018, 1 + %3 = load i32, i32* %channel, align 8 + %mul = mul nsw i32 %3, %add + %idx.ext = sext i32 %mul to i64 + %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext + %4 = load i32, i32* %add.ptr, align 4 + %mul1 = mul nsw i32 %0, %add + %idx.ext1 = sext i32 %mul1 to i64 + %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1 + %5 = load i32, i32* %add.ptr1, align 4 + %6 = add i32 %4, %5 + %cmp = icmp slt i32 %add, %length + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + + +define i32 @foo5(%struct.image* %input, i32 %length, i32* %in) { +entry: + %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1 + %0 = load i32, i32* %stride, align 4 + %cmp17 = icmp sgt i32 %length, 1 + br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %1 = phi i32 [ %7, %for.body ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ] + ret i32 %2 + +; This example is the same as above except that the first mul is used in two places +; and this may result in having two versions of the multiply: an i32 and i64 version. +; In this case, keep the trucate instructions to avoid this redundancy. +; CHECK: for.body: +; CHECK: trunc +for.body: ; preds = %for.body.lr.ph, %for.body + %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ] + %add = add nuw nsw i32 %x.018, 1 + %3 = load i32, i32* %channel, align 8 + %mul = mul nsw i32 %3, %add + %idx.ext = sext i32 %mul to i64 + %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext + %4 = load i32, i32* %add.ptr, align 4 + %mul1 = mul nsw i32 %0, %add + %idx.ext1 = sext i32 %mul1 to i64 + %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1 + %5 = load i32, i32* %add.ptr1, align 4 + %6 = add i32 %4, %5 + %7 = add i32 %6, %mul + %cmp = icmp slt i32 %add, %length + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +}