Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2507,33 +2507,52 @@
   auto StartValue = ID.getStartValue();
   assert(Index->getType() == Step->getType() &&
          "Index type does not match StepValue type");
+
+  // Note: the IR at this point is broken. We cannot use SE to create any new
+  // SCEV and then expand it, hoping that SCEV's simplification will give us
+  // more optimal code. Unfortunately, attempting to do so on invalid IR may
+  // lead to various SCEV crashes. So all we can do is use the builder and
+  // rely on InstCombine for future simplifications. Here we handle some
+  // trivial cases only.
+  auto CreateAdd = [&B](Value *X, Value *Y) {
+    assert(X->getType() == Y->getType() && "Types don't match!");
+    if (auto *CX = dyn_cast<ConstantInt>(X))
+      if (CX->isZero())
+        return Y;
+    if (auto *CY = dyn_cast<ConstantInt>(Y))
+      if (CY->isZero())
+        return X;
+    return B.CreateAdd(X, Y);
+  };
+
+  auto CreateMul = [&B](Value *X, Value *Y) {
+    assert(X->getType() == Y->getType() && "Types don't match!");
+    if (auto *CX = dyn_cast<ConstantInt>(X))
+      if (CX->isOne())
+        return Y;
+    if (auto *CY = dyn_cast<ConstantInt>(Y))
+      if (CY->isOne())
+        return X;
+    return B.CreateMul(X, Y);
+  };
+
   switch (ID.getKind()) {
   case InductionDescriptor::IK_IntInduction: {
     assert(Index->getType() == StartValue->getType() &&
            "Index type does not match StartValue type");
-
-    // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution
-    // and calculate (Start + Index * Step) for all cases, without
-    // special handling for "isOne" and "isMinusOne".
-    // But in the real life the result code getting worse. We mix SCEV
-    // expressions and ADD/SUB operations and receive redundant
-    // intermediate values being calculated in different ways and
-    // Instcombine is unable to reduce them all.
-
-    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
-      return B.CreateSub(StartValue, Index);
-    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne())
-      return B.CreateAdd(StartValue, Index);
-    const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue),
-                                   SE->getMulExpr(Step, SE->getSCEV(Index)));
-    return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
+    auto *Offset = CreateMul(
+        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
+    return CreateAdd(StartValue, Offset);
   }
   case InductionDescriptor::IK_PtrInduction: {
     assert(isa<SCEVConstant>(Step) &&
            "Expected constant step for pointer induction");
-    const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
-    Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
-    return B.CreateGEP(nullptr, StartValue, Index);
+    return B.CreateGEP(
+        nullptr, StartValue,
+        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
+                                           &*B.GetInsertPoint())));
   }
   case InductionDescriptor::IK_FpInduction: {
     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
Index: test/Transforms/LoopVectorize/X86/constant-fold.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -18,20 +18,19 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 0, [[TMP0]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], <i16 0, i16 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i16*, i16** [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16** [[TMP4]] to <2 x i16*>*
-; CHECK-NEXT:    store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
+; CHECK-NEXT:    store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 
 bb1:
Index: test/Transforms/LoopVectorize/induction.ll
===================================================================
--- test/Transforms/LoopVectorize/induction.ll
+++ test/Transforms/LoopVectorize/induction.ll
@@ -138,7 +138,7 @@
 ; CHECK-LABEL: @scalarize_induction_variable_02(
 ; CHECK: vector.body:
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %offset.idx = shl i64 %index, 3
+; CHECK: %offset.idx = mul i64 %index, 8
 ; CHECK: %[[i0:.+]] = add i64 %offset.idx, 0
 ; CHECK: %[[i1:.+]] = add i64 %offset.idx, 8
 ; CHECK: getelementptr inbounds float, float* %a, i64 %[[i0]]
@@ -149,7 +149,7 @@
 ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02(
 ; UNROLL-NO-IC: vector.body:
 ; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; UNROLL-NO-IC: %offset.idx = shl i64 %index, 3
+; UNROLL-NO-IC: %offset.idx = mul i64 %index, 8
 ; UNROLL-NO-IC: %[[i0:.+]] = add i64 %offset.idx, 0
 ; UNROLL-NO-IC: %[[i1:.+]] = add i64 %offset.idx, 8
 ; UNROLL-NO-IC: %[[i2:.+]] = add i64 %offset.idx, 16
Index: test/Transforms/LoopVectorize/iv_outside_user.ll
===================================================================
--- test/Transforms/LoopVectorize/iv_outside_user.ll
+++ test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -23,11 +23,10 @@
 ; CHECK-LABEL: @preinc
 ; CHECK-LABEL: middle.block:
 ; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1
-; CHECK: %ind.escape = add i32 0, %[[v3]]
 ; CHECK-LABEL: scalar.ph:
 ; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
 ; CHECK-LABEL: for.end:
-; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ]
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %[[v3]], %middle.block ]
 ; CHECK: ret i32 %[[RET]]
 define i32 @preinc(i32 %k) {
 entry:
@@ -135,16 +134,13 @@
 }
 
 ; CHECK-LABEL: @PR30742
+; CHECK: %[[T15:.+]] = add nsw i32 %tmp03, -7
 ; CHECK: vector.ph
 ; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2
 ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]]
 ; CHECK: middle.block
 ; CHECK: %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]]
-; CHECK: %[[T15:.+]] = add i32 %tmp03, -7
-; CHECK: %[[T16:.+]] = shl i32 %[[N_MOD_VF]], 3
-; CHECK: %[[T17:.+]] = add i32 %[[T15]], %[[T16]]
-; CHECK: %[[T18:.+]] = shl i32 {{.*}}, 3
-; CHECK: %ind.escape = sub i32 %[[T17]], %[[T18]]
+; CHECK: %ind.escape = add i32 %[[T15]],
 ; CHECK: br i1 %[[CMP]], label %BB3, label %scalar.ph
 define void @PR30742() {
 BB0:
Index: test/Transforms/LoopVectorize/pr39160.ll
===================================================================
--- test/Transforms/LoopVectorize/pr39160.ll
+++ test/Transforms/LoopVectorize/pr39160.ll
@@ -0,0 +1,98 @@
+; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure that we can compile the test without crashing.
+define void @barney() {
+
+; CHECK-LABEL: @barney(
+; CHECK: middle.block:
+
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %tmp4 = icmp slt i32 undef, 0
+  br i1 %tmp4, label %bb2, label %bb5
+
+bb5:                                              ; preds = %bb2
+  br label %bb19
+
+bb18:                                             ; preds = %bb33
+  ret void
+
+bb19:                                             ; preds = %bb36, %bb5
+  %tmp21 = phi i64 [ undef, %bb36 ], [ 2, %bb5 ]
+  %tmp22 = phi i32 [ %tmp65, %bb36 ], [ undef, %bb5 ]
+  br label %bb50
+
+bb33:                                             ; preds = %bb62
+  br i1 undef, label %bb18, label %bb36
+
+bb36:                                             ; preds = %bb33
+  br label %bb19
+
+bb46:                                             ; preds = %bb50
+  br i1 undef, label %bb48, label %bb59
+
+bb48:                                             ; preds = %bb46
+  %tmp49 = add i32 %tmp52, 14
+  ret void
+
+bb50:                                             ; preds = %bb50, %bb19
+  %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ]
+  %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ]
+  %tmp54 = add i32 %tmp52, 12
+  %tmp55 = add i32 %tmp52, 13
+  %tmp56 = add nuw nsw i64 %tmp53, 1
+  %tmp58 = icmp ult i64 %tmp53, undef
+  br i1 %tmp58, label %bb50, label %bb46
+
+bb59:                                             ; preds = %bb46
+  br label %bb62
+
+bb62:                                             ; preds = %bb68, %bb59
+  %tmp63 = phi i32 [ %tmp65, %bb68 ], [ %tmp55, %bb59 ]
+  %tmp64 = phi i64 [ %tmp66, %bb68 ], [ %tmp56, %bb59 ]
+  %tmp65 = add i32 %tmp63, 13
+  %tmp66 = add nuw nsw i64 %tmp64, 1
+  %tmp67 = icmp ult i64 %tmp66, %tmp21
+  br i1 %tmp67, label %bb68, label %bb33
+
+bb68:                                             ; preds = %bb62
+  br label %bb62
+}
+
+define i32 @foo(i32 addrspace(1)* %p) {
+
+; CHECK-LABEL: foo
+; CHECK: middle.block:
+
+entry:
+  br label %outer
+
+outer:                                            ; preds = %outer_latch, %entry
+  %iv = phi i64 [ 2, %entry ], [ %iv.next, %outer_latch ]
+  br label %inner
+
+inner:                                            ; preds = %inner, %outer
+  %0 = phi i32 [ %2, %inner ], [ 0, %outer ]
+  %a = phi i32 [ %3, %inner ], [ 1, %outer ]
+  %b = phi i32 [ %1, %inner ], [ 6, %outer ]
+  %1 = add i32 %b, 2
+  %2 = or i32 %0, %b
+  %3 = add nuw nsw i32 %a, 1
+  %4 = zext i32 %3 to i64
+  %5 = icmp ugt i64 %iv, %4
+  br i1 %5, label %inner, label %outer_latch
+
+outer_latch:                                      ; preds = %inner
+  store atomic i32 %2, i32 addrspace(1)* %p unordered, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %6 = icmp ugt i64 %iv, 63
+  br i1 %6, label %exit, label %outer
+
+exit:                                             ; preds = %outer_latch
+  ret i32 0
+}