Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5121,6 +5121,14 @@ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } + // If the trip count is constant, limit the interleave count to be at most + // the trip count divided by VF. + if (TC > 0) { + assert(TC >= VF && "VF exceeds trip count?"); + if ((TC / VF) < MaxInterleaveCount) + MaxInterleaveCount = (TC / VF); + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) @@ -5129,7 +5137,7 @@ assert(LoopCost && "Non-zero loop cost expected"); // Clamp the calculated IC to be between the 1 and the max interleave count - // that the target allows. + // that the target and trip count allow. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else if (IC < 1) Index: llvm/trunk/test/Transforms/LoopVectorize/X86/pr42674.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/pr42674.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/pr42674.ll @@ -3,24 +3,44 @@ @bytes = global [128 x i8] zeroinitializer, align 16 -; FIXME: We should end up with vector code for this loop, but don't because -; we try to create VF=64,UF=4 loop, but the scalar trip count is only 128 so -; the vector loop becomes dead code leaving only a scalar remainder. +; Make sure we end up with vector code for this loop. We used to try to create +; a VF=64,UF=4 loop, but the scalar trip count is only 128 so +; the vector loop was dead code leaving only a scalar remainder. 
define zeroext i8 @sum() { ; CHECK-LABEL: @sum( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_010:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ADD]] = add i8 [[TMP0]], [[R_010]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 -; CHECK: for.end: -; CHECK-NEXT: ret i8 [[ADD]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>* +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 16 +; CHECK-NEXT: [[TMP4]] = add <64 x i8> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5]] = add <64 x i8> [[WIDE_LOAD3]], [[VEC_PHI2]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 128 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <64 x i8> [[BIN_RDX]], <64 x i8> undef, <64 x i32> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <64 x i8> [[BIN_RDX]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <64 x i8> [[BIN_RDX4]], <64 x i8> undef, <64 x i32> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <64 x i8> [[BIN_RDX4]], [[RDX_SHUF5]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <64 x i8> [[BIN_RDX6]], <64 x i8> undef, <64 x i32> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <64 x i8> [[BIN_RDX6]], [[RDX_SHUF7]] +; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <64 x i8> [[BIN_RDX8]], <64 x i8> undef, <64 x i32> +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <64 x i8> [[BIN_RDX8]], [[RDX_SHUF9]] +; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <64 x i8> [[BIN_RDX10]], <64 x i8> undef, <64 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <64 x i8> [[BIN_RDX10]], [[RDX_SHUF11]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <64 x i8> [[BIN_RDX12]], <64 x i8> undef, <64 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <64 x i8> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[BIN_RDX14]], i32 0 +; CHECK-NEXT: ret i8 [[TMP7]] ; entry: br label %for.body