diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5222,8 +5222,8 @@ } ElementCount MaxVF = MaxVectorElementCount; - if (TTI.shouldMaximizeVectorBandwidth() || - (MaximizeBandwidth && isScalarEpilogueAllowed())) { + if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && + TTI.shouldMaximizeVectorBandwidth())) { auto MaxVectorElementCountMaxBW = ElementCount::get( PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), ComputeScalableMaxVF); @@ -5261,6 +5261,11 @@ MaxVF = MinVF; } } + + // Invalidate any widening decisions we might have made, in case the loop + // requires predication (decided later), but we have already made some + // load/store widening decisions. + invalidateCostModelingDecisions(); } return MaxVF; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 -disable-output | FileCheck %s --check-prefix=COST + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-eabi" + +; Check that the maximize vector bandwidth option does not give incorrect costs +; due to invalid cost decisions. The loop below has a low maximum trip count, +; so will be masked. 
+ +; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load +; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load +; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load +; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load +; COST: LV: Selecting VF: 1. + +define i32 @test(i8* nocapture noundef readonly %pInVec, i8* nocapture noundef readonly %pInA1, i8* nocapture noundef readonly %pInA2, i8* nocapture noundef readonly %pInA3, i8* nocapture noundef readonly %pInA4, i32 noundef %numCols) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMCOLS:%.*]], 3 +; CHECK-NEXT: [[CMP_NOT32:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT32]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[PINVEC_ADDR_042:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PINVEC:%.*]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SUM4_041:%.*]] = phi i32 [ [[ADD14:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SUM3_040:%.*]] = phi i32 [ [[ADD10:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SUM2_039:%.*]] = phi i32 [ [[ADD6:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SUM1_038:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[COLCNT_037:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[AND]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PINA1_ADDR_036:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PINA1:%.*]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PINA4_ADDR_035:%.*]] = phi i8* [ [[INCDEC_PTR11:%.*]], [[WHILE_BODY]] ], [ [[PINA4:%.*]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PINA3_ADDR_034:%.*]] = phi i8* [ [[INCDEC_PTR7:%.*]], [[WHILE_BODY]] ], 
[ [[PINA3:%.*]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PINA2_ADDR_033:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[PINA2:%.*]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PINVEC_ADDR_042]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[PINVEC_ADDR_042]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, i8* [[PINA1_ADDR_036]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[PINA1_ADDR_036]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = sext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] +; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[SUM1_038]] +; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PINA2_ADDR_033]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[PINA2_ADDR_033]], align 1 +; CHECK-NEXT: [[CONV4:%.*]] = sext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[CONV4]], [[CONV]] +; CHECK-NEXT: [[ADD6]] = add nsw i32 [[MUL5]], [[SUM2_039]] +; CHECK-NEXT: [[INCDEC_PTR7]] = getelementptr inbounds i8, i8* [[PINA3_ADDR_034]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[PINA3_ADDR_034]], align 1 +; CHECK-NEXT: [[CONV8:%.*]] = sext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV8]], [[CONV]] +; CHECK-NEXT: [[ADD10]] = add nsw i32 [[MUL9]], [[SUM3_040]] +; CHECK-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i8, i8* [[PINA4_ADDR_035]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[PINA4_ADDR_035]], align 1 +; CHECK-NEXT: [[CONV12:%.*]] = sext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[CONV12]], [[CONV]] +; CHECK-NEXT: [[ADD14]] = add nsw i32 [[MUL13]], [[SUM4_041]] +; CHECK-NEXT: [[DEC]] = add nsw i32 [[COLCNT_037]], -1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: 
[[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[ADD6_LCSSA:%.*]] = phi i32 [ [[ADD6]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi i32 [ [[ADD10]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[ADD14_LCSSA:%.*]] = phi i32 [ [[ADD14]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[ADD6_LCSSA]], [[ADD_LCSSA]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP5]], [[ADD10_LCSSA]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP6]], [[ADD14_LCSSA]] +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[ADD17:%.*]] = phi i32 [ [[TMP7]], [[WHILE_END_LOOPEXIT]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[ADD17]] +; +entry: + %and = and i32 %numCols, 3 + %cmp.not32 = icmp eq i32 %and, 0 + br i1 %cmp.not32, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %pInVec.addr.042 = phi i8* [ %incdec.ptr, %while.body ], [ %pInVec, %entry ] + %sum4.041 = phi i32 [ %add14, %while.body ], [ 0, %entry ] + %sum3.040 = phi i32 [ %add10, %while.body ], [ 0, %entry ] + %sum2.039 = phi i32 [ %add6, %while.body ], [ 0, %entry ] + %sum1.038 = phi i32 [ %add, %while.body ], [ 0, %entry ] + %colCnt.037 = phi i32 [ %dec, %while.body ], [ %and, %entry ] + %pInA1.addr.036 = phi i8* [ %incdec.ptr1, %while.body ], [ %pInA1, %entry ] + %pInA4.addr.035 = phi i8* [ %incdec.ptr11, %while.body ], [ %pInA4, %entry ] + %pInA3.addr.034 = phi i8* [ %incdec.ptr7, %while.body ], [ %pInA3, %entry ] + %pInA2.addr.033 = phi i8* [ %incdec.ptr3, %while.body ], [ %pInA2, %entry ] + %incdec.ptr = getelementptr inbounds i8, i8* %pInVec.addr.042, i64 1 + %0 = load i8, i8* %pInVec.addr.042, align 1 + %conv = sext i8 %0 to i32 + %incdec.ptr1 = getelementptr inbounds i8, i8* %pInA1.addr.036, i64 1 + %1 = load i8, i8* %pInA1.addr.036, align 1 + %conv2 = sext i8 %1 to i32 + %mul = mul nsw i32 %conv2, %conv + %add = add nsw i32 %mul, %sum1.038 + %incdec.ptr3 = getelementptr inbounds i8, i8* %pInA2.addr.033, i64 1 + %2 = load i8, 
i8* %pInA2.addr.033, align 1 + %conv4 = sext i8 %2 to i32 + %mul5 = mul nsw i32 %conv4, %conv + %add6 = add nsw i32 %mul5, %sum2.039 + %incdec.ptr7 = getelementptr inbounds i8, i8* %pInA3.addr.034, i64 1 + %3 = load i8, i8* %pInA3.addr.034, align 1 + %conv8 = sext i8 %3 to i32 + %mul9 = mul nsw i32 %conv8, %conv + %add10 = add nsw i32 %mul9, %sum3.040 + %incdec.ptr11 = getelementptr inbounds i8, i8* %pInA4.addr.035, i64 1 + %4 = load i8, i8* %pInA4.addr.035, align 1 + %conv12 = sext i8 %4 to i32 + %mul13 = mul nsw i32 %conv12, %conv + %add14 = add nsw i32 %mul13, %sum4.041 + %dec = add nsw i32 %colCnt.037, -1 + %cmp.not = icmp eq i32 %dec, 0 + br i1 %cmp.not, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %5 = add nsw i32 %add6, %add + %6 = add nsw i32 %5, %add10 + %7 = add nsw i32 %6, %add14 + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %add17 = phi i32 [ %7, %while.end.loopexit ], [ 0, %entry ] + ret i32 %add17 +}