Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4506,6 +4506,15 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (Instruction &I : *BB) { + + // Scalarize instructions that should remain scalar after vectorization. + if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) || + isa<DbgInfoIntrinsic>(&I)) && + Legal->isScalarAfterVectorization(&I)) { + scalarizeInstruction(&I); + continue; + } + switch (I.getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the Index: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll +++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -loop-vectorize -S | FileCheck %s +; CHECK: vector.body: ; CHECK: fadd ; CHECK-NEXT: fadd ; CHECK-NEXT: fadd @@ -12,9 +13,8 @@ ; CHECK-NEXT: fadd ; CHECK-NEXT: fadd ; CHECK-NEXT: fadd -; CHECK-NEXT: = ; CHECK-NOT: fadd -; CHECK-SAME: > +; CHECK: middle.block target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-ibm-linux-gnu" Index: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll +++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll @@ -43,7 +43,7 @@ ; CHECK-LABEL: @s173 ; CHECK: load <4 x float>, <4 x float>* -; CHECK: add i64 %index, 16000 +; CHECK: add nsw i64 %index, 16000 ; CHECK: ret i32 0 } Index: llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll =================================================================== --- 
llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll +++ llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll @@ -387,7 +387,7 @@ ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias08( -; CHECK: sub <4 x i32> +; CHECK: load <4 x i32> ; CHECK: ret define i32 @noAlias08(i32 %a) #0 { @@ -439,7 +439,7 @@ ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias09( -; CHECK: sub <4 x i32> +; CHECK: load <4 x i32> ; CHECK: ret define i32 @noAlias09(i32 %a) #0 { @@ -721,7 +721,7 @@ ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias14( -; CHECK: sub <4 x i32> +; CHECK: load <4 x i32> ; CHECK: ret define i32 @noAlias14(i32 %a) #0 { Index: llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll +++ llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll @@ -9,7 +9,9 @@ ;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] ;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next2, %vector.body ] -;CHECK: add nsw <4 x i64> %vec.ind, <i64 12, i64 12, i64 12, i64 12> +;CHECK: %[[T1:.+]] = add i64 %index, 0 +;CHECK: %[[T2:.+]] = add nsw i64 %[[T1]], 12 +;CHECK: getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 %[[T2]] ;CHECK: %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4> ;CHECK: %vec.ind.next2 = add <4 x i32> %vec.ind1, <i32 4, i32 4, i32 4, i32 4> ;CHECK: ret i32 Index: llvm/trunk/test/Transforms/LoopVectorize/scalar_after_vectorization.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/scalar_after_vectorization.ll +++ llvm/trunk/test/Transforms/LoopVectorize/scalar_after_vectorization.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -force-vector-width=4 -force-vector-interleave=2 
-loop-vectorize -S | FileCheck %s --check-prefix=NO-IC + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @scalar_after_vectorization_0 +; +; CHECK: vector.body: +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = or i64 %index, 1 +; CHECK: %[[T2:.+]] = add nuw nsw i64 %offset.idx, %tmp0 +; CHECK: %[[T3:.+]] = sub nsw i64 %[[T2]], %x +; CHECK: %[[T4:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T3]] +; CHECK: %[[T5:.+]] = bitcast i32* %[[T4]] to <4 x i32>* +; CHECK: load <4 x i32>, <4 x i32>* %[[T5]], align 4 +; CHECK: %[[T6:.+]] = getelementptr i32, i32* %[[T4]], i64 4 +; CHECK: %[[T7:.+]] = bitcast i32* %[[T6]] to <4 x i32>* +; CHECK: load <4 x i32>, <4 x i32>* %[[T7]], align 4 +; CHECK: br {{.*}}, label %middle.block, label %vector.body +; +; NO-IC-LABEL: @scalar_after_vectorization_0 +; +; NO-IC: vector.body: +; NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; NO-IC: %offset.idx = add i64 1, %index +; NO-IC: %[[T2:.+]] = add i64 %offset.idx, 0 +; NO-IC: %[[T3:.+]] = add i64 %offset.idx, 4 +; NO-IC: %[[T4:.+]] = add nuw nsw i64 %[[T2]], %tmp0 +; NO-IC: %[[T5:.+]] = add nuw nsw i64 %[[T3]], %tmp0 +; NO-IC: %[[T6:.+]] = sub nsw i64 %[[T4]], %x +; NO-IC: %[[T7:.+]] = sub nsw i64 %[[T5]], %x +; NO-IC: %[[T8:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T6]] +; NO-IC: %[[T9:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T7]] +; NO-IC: %[[T10:.+]] = getelementptr i32, i32* %[[T8]], i32 0 +; NO-IC: %[[T11:.+]] = bitcast i32* %[[T10]] to <4 x i32>* +; NO-IC: load <4 x i32>, <4 x i32>* %[[T11]], align 4 +; NO-IC: %[[T12:.+]] = getelementptr i32, i32* %[[T8]], i32 4 +; NO-IC: %[[T13:.+]] = bitcast i32* %[[T12]] to <4 x i32>* +; NO-IC: load <4 x i32>, <4 x i32>* %[[T13]], align 4 +; NO-IC: br {{.*}}, label %middle.block, label %vector.body +; +define void @scalar_after_vectorization_0(i32* noalias %a, i32* noalias %b, i64 %x, i64 %y) { + +outer.ph: + br label 
%outer.body + +outer.body: + %i = phi i64 [ 1, %outer.ph ], [ %i.next, %inner.end ] + %tmp0 = mul nuw nsw i64 %i, %x + br label %inner.ph + +inner.ph: + br label %inner.body + +inner.body: + %j = phi i64 [ 1, %inner.ph ], [ %j.next, %inner.body ] + %tmp1 = add nuw nsw i64 %j, %tmp0 + %tmp2 = sub nsw i64 %tmp1, %x + %tmp3 = getelementptr inbounds i32, i32* %a, i64 %tmp2 + %tmp4 = load i32, i32* %tmp3, align 4 + %tmp5 = getelementptr inbounds i32, i32* %b, i64 %tmp1 + store i32 %tmp4, i32* %tmp5, align 4 + %j.next = add i64 %j, 1 + %cond.j = icmp slt i64 %j.next, %y + br i1 %cond.j, label %inner.body, label %inner.end + +inner.end: + %i.next = add i64 %i, 1 + %cond.i = icmp slt i64 %i.next, %y + br i1 %cond.i, label %outer.body, label %outer.end + +outer.end: + ret void +}