Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2507,33 +2507,52 @@ auto StartValue = ID.getStartValue(); assert(Index->getType() == Step->getType() && "Index type does not match StepValue type"); + + // Note: the IR at this point is broken. We cannot use SE to create any new + // SCEV and then expand it, hoping that SCEV's simplification will give us + // a more optimal code. Unfortunately, attempt of doing so on invalid IR may + // lead to various SCEV crashes. So all we can do is to use builder and rely + // on InstCombine for future simplifications. Here we handle some trivial + // cases only. + auto CreateAdd = [&](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isZero()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isZero()) + return X; + return B.CreateAdd(X, Y); + }; + + auto CreateMul = [&](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isOne()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isOne()) + return X; + return B.CreateMul(X, Y); + }; + switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { assert(Index->getType() == StartValue->getType() && "Index type does not match StartValue type"); - - // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution - // and calculate (Start + Index * Step) for all cases, without - // special handling for "isOne" and "isMinusOne". - // But in the real life the result code getting worse. We mix SCEV - // expressions and ADD/SUB operations and receive redundant - // intermediate values being calculated in different ways and - // Instcombine is unable to reduce them all. 
- if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) return B.CreateSub(StartValue, Index); - if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne()) - return B.CreateAdd(StartValue, Index); - const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue), - SE->getMulExpr(Step, SE->getSCEV(Index))); - return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint()); + auto *Offset = CreateMul( + Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); + return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { assert(isa<SCEVConstant>(Step) && "Expected constant step for pointer induction"); - const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step); - Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint()); - return B.CreateGEP(nullptr, StartValue, Index); + return B.CreateGEP( + nullptr, StartValue, + CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), + &*B.GetInsertPoint()))); } case InductionDescriptor::IK_FpInduction: { assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); Index: test/Transforms/LoopVectorize/X86/constant-fold.ll =================================================================== --- test/Transforms/LoopVectorize/X86/constant-fold.ll +++ test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -18,20 +18,19 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 0, [[TMP0]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> 
[[BROADCAST_SPLAT]], <i16 0, i16 1> -; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16*, i16** [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16** [[TMP4]] to <2 x i16*>* -; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>* +; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: bb1: Index: test/Transforms/LoopVectorize/X86/pr39160.ll =================================================================== --- /dev/null +++ 
test/Transforms/LoopVectorize/X86/pr39160.ll @@ -0,0 +1,116 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" +target triple = "x86_64-unknown-linux-gnu" + +; Make sure that we can compile the test without crash. + +define void @barney(i8 addrspace(1)** %arg) #0 { +; CHECK-LABEL: @barney( +; CHECK: middle.block: + +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + br label %bb2 + +bb2: ; preds = %bb2, %bb1 + %tmp = phi i32 [ 0, %bb1 ], [ %tmp3, %bb2 ] + %tmp3 = add i32 %tmp, 1 + %tmp4 = icmp slt i32 %tmp3, 0 + br i1 %tmp4, label %bb2, label %bb5 + +bb5: ; preds = %bb2 + br i1 true, label %bb6, label %bb1 + +bb6: ; preds = %bb5 + %tmp7 = load i8 addrspace(1)*, i8 addrspace(1)** %arg, align 8 + %tmp8 = getelementptr inbounds i8, i8 addrspace(1)* %tmp7, i64 816 + %tmp9 = bitcast i8 addrspace(1)* %tmp8 to i64 addrspace(1)* + %tmp10 = load i64, i64 addrspace(1)* %tmp9, align 8 + %tmp11 = trunc i64 %tmp10 to i32 + %tmp12 = and i32 %tmp11, 31 + %tmp13 = lshr i32 1, %tmp12 + %tmp14 = getelementptr inbounds i8, i8 addrspace(1)* %tmp7, i64 800 + %tmp15 = bitcast i8 addrspace(1)* %tmp14 to i8 addrspace(1)* addrspace(1)* + %tmp16 = getelementptr inbounds i8, i8 addrspace(1)* %tmp7, i64 808 + %tmp17 = bitcast i8 addrspace(1)* %tmp16 to i8 addrspace(1)* addrspace(1)* + br label %bb19 + +bb18: ; preds = %bb42, %bb36, %bb33 + ret void + +bb19: ; preds = %bb42, %bb6 + %tmp20 = phi i64 [ %tmp45, %bb42 ], [ 0, %bb6 ] + %tmp21 = phi i64 [ %tmp43, %bb42 ], [ 2, %bb6 ] + %tmp22 = phi i32 [ %tmp65, %bb42 ], [ %tmp13, %bb6 ] + %tmp23 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %tmp15, align 8 + %tmp24 = getelementptr inbounds i8, i8 addrspace(1)* %tmp23, i64 12 + %tmp25 = bitcast i8 addrspace(1)* %tmp24 to i32 addrspace(1)* + %tmp26 = getelementptr inbounds i8, i8 addrspace(1)* %tmp23, i64 8 + %tmp27 = bitcast i8 addrspace(1)* %tmp26 to i32 addrspace(1)* + %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 8 
+ %tmp29 = zext i32 %tmp28 to i64 + %tmp30 = icmp ult i64 %tmp20, %tmp29 + %tmp31 = select i1 %tmp30, i64 %tmp20, i64 %tmp29 + %tmp32 = icmp eq i64 %tmp31, 0 + br i1 %tmp32, label %bb59, label %bb50 + +bb33: ; preds = %bb62 + %tmp34 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %tmp17, align 8 + %tmp35 = icmp eq i8 addrspace(1)* %tmp34, null + br i1 %tmp35, label %bb18, label %bb36 + +bb36: ; preds = %bb33 + %tmp37 = getelementptr inbounds i8, i8 addrspace(1)* %tmp34, i64 8 + %tmp38 = bitcast i8 addrspace(1)* %tmp37 to i32 addrspace(1)* + %tmp39 = load i32, i32 addrspace(1)* %tmp38, align 8 + %tmp40 = zext i32 %tmp39 to i64 + %tmp41 = icmp ult i64 %tmp21, %tmp40 + br i1 %tmp41, label %bb42, label %bb18 + +bb42: ; preds = %bb36 + %tmp43 = add nuw nsw i64 %tmp21, 1 + %tmp44 = icmp ugt i64 %tmp21, 88 + %tmp45 = add nuw nsw i64 %tmp20, 1 + br i1 %tmp44, label %bb18, label %bb19 + +bb46: ; preds = %bb50 + %tmp47 = icmp eq i32 %tmp28, 0 + br i1 %tmp47, label %bb48, label %bb59 + +bb48: ; preds = %bb46 + %tmp49 = add i32 %tmp52, 14 + store i32 %tmp49, i32* undef, align 4 + ret void + +bb50: ; preds = %bb50, %bb19 + %tmp51 = phi i32 addrspace(1)* [ %tmp57, %bb50 ], [ %tmp25, %bb19 ] + %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ] + %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ] + %tmp54 = add i32 %tmp52, 12 + store i32 %tmp54, i32 addrspace(1)* %tmp51, align 4 + %tmp55 = add i32 %tmp52, 13 + %tmp56 = add nuw nsw i64 %tmp53, 1 + %tmp57 = getelementptr inbounds i32, i32 addrspace(1)* %tmp25, i64 %tmp53 + %tmp58 = icmp ult i64 %tmp53, %tmp31 + br i1 %tmp58, label %bb50, label %bb46 + +bb59: ; preds = %bb46, %bb19 + %tmp60 = phi i32 [ %tmp22, %bb19 ], [ %tmp55, %bb46 ] + %tmp61 = phi i64 [ 1, %bb19 ], [ %tmp56, %bb46 ] + br label %bb62 + +bb62: ; preds = %bb68, %bb59 + %tmp63 = phi i32 [ %tmp65, %bb68 ], [ %tmp60, %bb59 ] + %tmp64 = phi i64 [ %tmp66, %bb68 ], [ %tmp61, %bb59 ] + %tmp65 = add i32 %tmp63, 13 + %tmp66 = add nuw nsw i64 %tmp64, 1 + %tmp67 = 
icmp ult i64 %tmp66, %tmp21 + br i1 %tmp67, label %bb68, label %bb33 + +bb68: ; preds = %bb62 + br label %bb62 +} + +attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" } Index: test/Transforms/LoopVectorize/induction.ll =================================================================== --- test/Transforms/LoopVectorize/induction.ll +++ test/Transforms/LoopVectorize/induction.ll @@ -138,7 +138,7 @@ ; CHECK-LABEL: @scalarize_induction_variable_02( ; CHECK: vector.body: ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %offset.idx = shl i64 %index, 3 +; CHECK: %offset.idx = mul i64 %index, 8 ; CHECK: %[[i0:.+]] = add i64 %offset.idx, 0 ; CHECK: %[[i1:.+]] = add i64 %offset.idx, 8 ; CHECK: getelementptr inbounds float, float* %a, i64 %[[i0]] @@ -149,7 +149,7 @@ ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02( ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; UNROLL-NO-IC: %offset.idx = shl i64 %index, 3 +; UNROLL-NO-IC: %offset.idx = mul i64 %index, 8 ; UNROLL-NO-IC: %[[i0:.+]] = add i64 %offset.idx, 0 ; UNROLL-NO-IC: %[[i1:.+]] = add i64 %offset.idx, 8 ; UNROLL-NO-IC: %[[i2:.+]] = add i64 %offset.idx, 16 Index: test/Transforms/LoopVectorize/iv_outside_user.ll =================================================================== --- test/Transforms/LoopVectorize/iv_outside_user.ll +++ 
test/Transforms/LoopVectorize/iv_outside_user.ll @@ -23,11 +23,10 @@ ; CHECK-LABEL: @preinc ; CHECK-LABEL: middle.block: ; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1 -; CHECK: %ind.escape = add i32 0, %[[v3]] ; CHECK-LABEL: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ] ; CHECK-LABEL: for.end: -; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ] +; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %[[v3]], %middle.block ] ; CHECK: ret i32 %[[RET]] define i32 @preinc(i32 %k) { entry: @@ -135,16 +134,13 @@ } ; CHECK-LABEL: @PR30742 +; CHECK: %[[T15:.+]] = add nsw i32 %tmp03, -7 ; CHECK: vector.ph ; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2 ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] ; CHECK: middle.block ; CHECK: %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]] -; CHECK: %[[T15:.+]] = add i32 %tmp03, -7 -; CHECK: %[[T16:.+]] = shl i32 %[[N_MOD_VF]], 3 -; CHECK: %[[T17:.+]] = add i32 %[[T15]], %[[T16]] -; CHECK: %[[T18:.+]] = shl i32 {{.*}}, 3 -; CHECK: %ind.escape = sub i32 %[[T17]], %[[T18]] +; CHECK: %ind.escape = add i32 %[[T15]], ; CHECK: br i1 %[[CMP]], label %BB3, label %scalar.ph define void @PR30742() { BB0: