Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -245,6 +245,9 @@ MPM.add(createLoopInterchangePass()); // Interchange loops MPM.add(createCFGSimplificationPass()); } + + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); + if (!DisableUnrollLoops) MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); @@ -465,7 +468,6 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false)); - MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. Index: llvm/test/Other/pass-pipelines.ll =================================================================== --- llvm/test/Other/pass-pipelines.ll +++ llvm/test/Other/pass-pipelines.ll @@ -49,6 +49,10 @@ ; FIXME: It isn't clear that we need yet another loop pass pipeline ; and run of LICM here. ; CHECK-O2-NOT: Manager +; CHECK-O2: Loop Vectorization +; CHECK-O2-NOT: Manager +; CHECK-O2: Loop Pass Manager +; CHECK-O2-NEXT: Unroll loops ; CHECK-O2: Loop Pass Manager ; CHECK-O2-NEXT: Loop Invariant Code Motion ; CHECK-O2-NOT: Manager @@ -69,8 +73,6 @@ ; CHECK-O2: Loop Pass Manager ; CHECK-O2-NEXT: Rotate Loops ; CHECK-O2-NOT: Manager -; CHECK-O2: Loop Vectorization -; CHECK-O2-NOT: Manager ; CHECK-O2: SLP Vectorizer ; CHECK-O2-NOT: Manager ; After vectorization we do partial unrolling. 
Index: llvm/test/Transforms/LoopVectorize/LoopWithConstTripCount1.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/LoopWithConstTripCount1.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -O3 -S | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +;This test verifies that a loop with a constant trip count gets vectorized. +;original C code: +;int a[32]; +;int reduce_add() { +; int s = 0; +; for (int i = 0; i < 32; ++i) +; s = s + a[i]; +; return s; +;} +; +;https://llvm.org/bugs/show_bug.cgi?id=25748 +; +;CHECK-LABEL:@_Z6reduceR1V +;CHECK:load <4 x i16> +;CHECK:zext <4 x i16> +;CHECK:add <4 x i32> +;CHECK:extractelement <4 x i32> + +%struct.V = type { [32 x i16] } + +; Function Attrs: nounwind uwtable +define i32 @_Z6reduceR1V(%struct.V* dereferenceable(64) %v) #0 { +entry: + %v.addr = alloca %struct.V*, align 8 + %sum = alloca i32, align 4 + %i = alloca i32, align 4 + store %struct.V* %v, %struct.V** %v.addr, align 8 + store i32 0, i32* %sum, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load %struct.V*, %struct.V** %v.addr, align 8 + %cmp = icmp slt i32 %0, 32 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %3 = load %struct.V*, %struct.V** %v.addr, align 8 + %data = getelementptr inbounds %struct.V, %struct.V* %3, i32 0, i32 0 + %arrayidx = getelementptr inbounds [32 x i16], [32 x i16]* %data, i64 0, i64 %idxprom + %4 = load i16, i16* %arrayidx, align 2 + %conv = zext i16 %4 to i32 + %5 = load i32, i32* %sum, align 4 + %add = add nsw i32 %5, %conv + store i32 %add, i32* %sum, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %6 = load i32, i32* %i, align 4 + %inc = add nsw i32 %6, 1 + store i32 %inc, i32* %i, align 4 + 
br label %for.cond + +for.end: ; preds = %for.cond + %7 = load i32, i32* %sum, align 4 + ret i32 %7 +} + Index: llvm/test/Transforms/LoopVectorize/LoopWithConstTripCount2.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/LoopWithConstTripCount2.ll @@ -0,0 +1,64 @@ +; RUN: opt < %s -O3 -S | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;This test verifies that a loop with a constant trip count gets vectorized. +;original code: +;struct V { +; static constexpr int length = 32; +; unsigned short data[32]; +;}; +; +;int reduce(V &v) { +; int sum = 0; +; for (int i = 0; i < v.length; ++i) { +; sum += static_cast<int>(v.data[i]); +; } +; return sum; +;} +; +;https://llvm.org/bugs/show_bug.cgi?id=28090 + +;CHECK-LABEL: @reduce_add +;CHECK:load <4 x i32> +;CHECK:add <4 x i32> +;CHECK:extractelement <4 x i32> + +@a = common global [32 x i32] zeroinitializer, align 16 + +define i32 @reduce_add() #0 { +entry: + %s = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %s, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 32 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %s, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @a, i64 0, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %1, %3 + store i32 %add, i32* %s, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %5 = load i32, i32* %s, align 4 + ret i32 %5 +} +