diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -998,7 +998,8 @@ /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as - /// 64 bit loop indices. + /// 64 bit loop indices. Non-power of 2 sizes are rounded up to the next power + /// of 2. std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); /// \return The desired interleave count. @@ -5225,7 +5226,8 @@ } } - return {MinWidth, MaxWidth}; + // Round up to the next power of 2, if min or max widths aren't powers of 2. + return {PowerOf2Ceil(MinWidth), PowerOf2Ceil(MaxWidth)}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -S %s -mattr=+avx512f | FileCheck %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.15.0" + +; Make sure non-power-of-2 types are rounded up to the next power of 2. 
+ +define x86_fp80 @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: foo.exit: +; CHECK-NEXT: br label [[FOR_BODY3_I_3:%.*]] +; CHECK: for.body3.i.3: +; CHECK-NEXT: [[N_ADDR_112_I_3:%.*]] = phi i64 [ [[DEC_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 24, [[FOO_EXIT:%.*]] ] +; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ undef, [[FOO_EXIT]] ] +; CHECK-NEXT: [[MUL_I_3]] = fmul x86_fp80 [[X_ADDR_111_I_3]], 0xK40008000000000000000 +; CHECK-NEXT: [[DEC_I_3]] = add nsw i64 [[N_ADDR_112_I_3]], -1 +; CHECK-NEXT: [[CMP2_I_3:%.*]] = icmp sgt i64 [[N_ADDR_112_I_3]], 1 +; CHECK-NEXT: br i1 [[CMP2_I_3]], label [[FOR_BODY3_I_3]], label [[FOO_EXIT_3:%.*]] +; CHECK: foo.exit.3: +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi x86_fp80 [ [[MUL_I_3]], [[FOR_BODY3_I_3]] ] +; CHECK-NEXT: ret x86_fp80 [[MUL_LCSSA]] +; +foo.exit: + br label %for.body3.i.3 + +for.body3.i.3: ; preds = %for.body3.i.3, %foo.exit + %n.addr.112.i.3 = phi i64 [ %dec.i.3, %for.body3.i.3 ], [ 24, %foo.exit ] + %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ undef, %foo.exit ] + %mul.i.3 = fmul x86_fp80 %x.addr.111.i.3, 0xK40008000000000000000 + %dec.i.3 = add nsw i64 %n.addr.112.i.3, -1 + %cmp2.i.3 = icmp sgt i64 %n.addr.112.i.3, 1 + br i1 %cmp2.i.3, label %for.body3.i.3, label %foo.exit.3 + +foo.exit.3: ; preds = %for.body3.i.3 + %mul.lcssa = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ] + ret x86_fp80 %mul.lcssa +}