Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2716,13 +2716,13 @@ return ScalarValue; } - // Get the last scalar instruction we generated for V. If the value is - // known to be uniform after vectorization, this corresponds to lane zero - // of the last unroll iteration. Otherwise, the last instruction is the one - // we created for the last vector lane of the last unroll iteration. + // Get the last scalar instruction we generated for V and Part. If the value + // is known to be uniform after vectorization, this corresponds to lane zero + // of the Part unroll iteration. Otherwise, the last instruction is the one + // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; auto *LastInst = - cast<Instruction>(getOrCreateScalarValue(V, UF - 1, LastLane)); + cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); // Set the insert point after the last scalarized instruction. This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -4047,7 +4047,8 @@ auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); - // Get the vectorized previous value. + // Get the vectorized previous value of the last part UF - 1. It appears last + // among all unrolled iterations, due to the order of their construction. Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); // Set the insertion point after the previous value if it is an instruction. 
Index: test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- test/Transforms/LoopVectorize/first-order-recurrence.ll +++ test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -295,14 +295,14 @@ ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load i32, i32* {{.*}} ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load i32, i32* {{.*}} ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i32, i32* {{.*}} -; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i32, i32* {{.*}} -; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i32, i32* {{.*}} -; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i32, i32* {{.*}} -; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = load i32, i32* {{.*}} ; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP28]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP29]], i32 2 ; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP30]], i32 3 +; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = load i32, i32* {{.*}} ; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP32]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP33]], i32 2 @@ -396,3 +396,54 @@ for.end: ret i32 %val.phi } + +; We vectorize this first order recurrence, with a set of insertelements for +; each unrolled part. Make sure these insertelements are generated in-order, +; because the shuffle of the first order recurrence will be added after the +; insertelement of the last part UF - 1, assuming the latter appears after the +; insertelements of all other parts. 
+; +; int PR33613(double *b, double j, int d) { +; int a = 0; +; for(int i = 0; i < 10240; i++, b+=25) { +; double f = b[d]; // Scalarize to form insertelements +; if (j * f) +; a++; +; j = f; +; } +; return a; +; } +; +; UNROLL-NO-IC-LABEL: @PR33613( +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x double> +; UNROLL-NO-IC: shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL-NO-IC-NEXT: shufflevector <4 x double> {{.*}}, <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL-NO-IC-NOT: insertelement <4 x double> +; UNROLL-NO-IC: middle.block: +; +define i32 @PR33613(double* %b, double %j, i32 %d) { +entry: + %idxprom = sext i32 %d to i64 + br label %for.body + +for.cond.cleanup: + %a.1.lcssa = phi i32 [ %a.1, %for.body ] + ret i32 %a.1.lcssa + +for.body: + %b.addr.012 = phi double* [ %b, %entry ], [ %add.ptr, %for.body ] + %i.011 = phi i32 [ 0, %entry ], [ %inc1, %for.body ] + %a.010 = phi i32 [ 0, %entry ], [ %a.1, %for.body ] + %j.addr.09 = phi double [ %j, %entry ], [ %0, %for.body ] + %arrayidx = getelementptr inbounds double, double* %b.addr.012, i64 %idxprom + %0 = load double, double* %arrayidx, align 8 + %mul = fmul double %j.addr.09, %0 + %tobool = fcmp une double %mul, 0.000000e+00 + %inc = zext i1 %tobool to i32 + %a.1 = add nsw i32 %a.010, %inc + %inc1 = add nuw nsw i32 %i.011, 1 + %add.ptr = getelementptr inbounds double, double* %b.addr.012, i64 25 + %exitcond = icmp eq i32 %inc1, 10240 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +}