Index: llvm/lib/Transforms/Scalar/IndVarSimplify.cpp =================================================================== --- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -730,8 +730,8 @@ Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); bool widenLoopCompare(NarrowIVDefUse DU); - bool widenWithVariantLoadUse(NarrowIVDefUse DU); - void widenWithVariantLoadUseCodegen(NarrowIVDefUse DU); + bool widenWithVariantUse(NarrowIVDefUse DU); + void widenWithVariantUseCodegen(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; @@ -1069,20 +1069,27 @@ return true; } -/// If the narrow use is an instruction whose two operands are the defining -/// instruction of DU and a load instruction, then we have the following: -/// if the load is hoisted outside the loop, then we do not reach this function -/// as scalar evolution analysis works fine in widenIVUse with variables -/// hoisted outside the loop and efficient code is subsequently generated by -/// not emitting truncate instructions. But when the load is not hoisted -/// (whether due to limitation in alias analysis or due to a true legality), -/// then scalar evolution can not proceed with loop variant values and -/// inefficient code is generated. This function handles the non-hoisted load -/// special case by making the optimization generate the same type of code for -/// hoisted and non-hoisted load (widen use and eliminate sign extend -/// instruction). This special case is important especially when the induction -/// variables are affecting addressing mode in code generation. -bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) { +// The widenIVUse avoids generating trunc by evaluating the use as AddRec; this +// will not work when: +// 1) SCEV traces back to an instruction inside the loop that SCEV can not +// expand, e.g. add %indvar, (load %addr) +// 2) SCEV finds a loop variant, e.g. 
add %indvar, %loopvariant +// While SCEV fails to avoid trunc, we can still try to use instruction +// combining approach to prove trunc is not required. This can be further +// extended with other instruction combining checks, but for now we handle the +// following case (sub can be "add" and "mul", "nsw + sext" can be "nuw + zext") +// +// Src: +// %c = sub nsw %b, %indvar +// %d = sext %c to i64 +// Dst: +// %indvar.ext1 = sext %indvar to i64 +// %m = sext %b to i64 +// %d = sub nsw i64 %m, %indvar.ext1 +// Therefore, as long as the result of add/sub/mul is extended to wide type, no +// trunc is required regardless of how %b is generated. This pattern is common +// when calculating address in 64 bit architecture +bool WidenIV::widenWithVariantUse(NarrowIVDefUse DU) { Instruction *NarrowUse = DU.NarrowUse; Instruction *NarrowDef = DU.NarrowDef; Instruction *WideDef = DU.WideDef; @@ -1113,12 +1120,6 @@ else return false; - // We are interested in the other operand being a load instruction. - // But, we should look into relaxing this restriction later on. - auto *I = dyn_cast(NarrowUse->getOperand(ExtendOperIdx)); - if (I && I->getOpcode() != Instruction::Load) - return false; - // Verifying that Defining operand is an AddRec const SCEV *Op1 = SE->getSCEV(WideDef); const SCEVAddRecExpr *AddRecOp1 = dyn_cast(Op1); @@ -1150,9 +1151,9 @@ return true; } -/// Special Case for widening with variant Loads (see -/// WidenIV::widenWithVariantLoadUse). This is the code generation part. -void WidenIV::widenWithVariantLoadUseCodegen(NarrowIVDefUse DU) { +/// Special Case for widening with loop variant (see +/// WidenIV::widenWithVariantUse). This is the code generation part. +void WidenIV::widenWithVariantUseCodegen(NarrowIVDefUse DU) { Instruction *NarrowUse = DU.NarrowUse; Instruction *NarrowDef = DU.NarrowDef; Instruction *WideDef = DU.WideDef; @@ -1300,8 +1301,8 @@ // in WideAddRec.first does not indicate a polynomial induction expression. 
// In that case, look at the operands of the use instruction to determine // if we can still widen the use instead of truncating its operand. - if (widenWithVariantLoadUse(DU)) { - widenWithVariantLoadUseCodegen(DU); + if (widenWithVariantUse(DU)) { + widenWithVariantUseCodegen(DU); return nullptr; } Index: llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll =================================================================== --- llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll +++ llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll @@ -419,3 +419,52 @@ %cmp = icmp slt i32 %add, %length br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit } + +define i32 @foo6(%struct.image* %input, i32 %length, i32* %in) { +entry: + %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1 + %0 = load i32, i32* %stride, align 4 + %cmp17 = icmp sgt i32 %length, 1 + br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %1 = phi i32 [ %6, %for.body ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ] + ret i32 %2 + +; Extend foo4 so that any loop variants (%3 and %or) with mul/sub/add then extend will not +; need a trunc instruction +; CHECK: for.body: +; CHECK-NOT: trunc +; CHECK: [[TMP0:%.*]] = and i32 %length, %0 +; CHECK-NEXT: zext i32 [[TMP0]] to i64 +; CHECK: [[TMP1:%.*]] = or i32 %length, [[TMP2:%.*]] +; CHECK-NEXT: zext i32 [[TMP1]] to i64 +for.body: ; preds = %for.body.lr.ph, %for.body + %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ] + %add = add nuw nsw i32 %x.018, 1 + %3 = and i32 %length, %0 + %mul = mul nuw i32 %3, %add + %idx.ext = zext i32 %mul to i64 + %add.ptr = getelementptr inbounds i32, i32* 
%in, i64 %idx.ext + %4 = load i32, i32* %add.ptr, align 4 + %mul1 = mul nuw i32 %0, %add + %idx.ext1 = zext i32 %mul1 to i64 + %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1 + %5 = load i32, i32* %add.ptr1, align 4 + %or = or i32 %length, %5 + %sub.or = sub nuw i32 %or, %add + %or.ext = zext i32 %sub.or to i64 + %ptr.or = getelementptr inbounds i32, i32* %in, i64 %or.ext + %val.or = load i32, i32* %ptr.or + %6 = add i32 %4, %val.or + %cmp = icmp ult i32 %add, %length + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +}