Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -391,7 +391,7 @@
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
-  unsigned getMaxInterleaveFactor() const;
+  unsigned getMaxInterleaveFactor(unsigned VF) const;
 
   /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
   unsigned
@@ -549,7 +549,7 @@
                                  const APInt &Imm, Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
-  virtual unsigned getMaxInterleaveFactor() = 0;
+  virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
   virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                           OperandValueKind Opd1Info,
                                           OperandValueKind Opd2Info,
@@ -687,8 +687,8 @@
   unsigned getRegisterBitWidth(bool Vector) override {
     return Impl.getRegisterBitWidth(Vector);
   }
-  unsigned getMaxInterleaveFactor() override {
-    return Impl.getMaxInterleaveFactor();
+  unsigned getMaxInterleaveFactor(unsigned VF) override {
+    return Impl.getMaxInterleaveFactor(VF);
   }
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   OperandValueKind Opd1Info,
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -261,7 +261,7 @@
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
-  unsigned getMaxInterleaveFactor() { return 1; }
+  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   TTI::OperandValueKind Opd1Info,
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -283,7 +283,7 @@
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
-  unsigned getMaxInterleaveFactor() { return 1; }
+  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
 
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -182,8 +182,8 @@
   return TTIImpl->getRegisterBitWidth(Vector);
 }
 
-unsigned TargetTransformInfo::getMaxInterleaveFactor() const {
-  return TTIImpl->getMaxInterleaveFactor();
+unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
+  return TTIImpl->getMaxInterleaveFactor(VF);
 }
 
 unsigned TargetTransformInfo::getArithmeticInstrCost(
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -110,7 +110,7 @@
     return 64;
   }
 
-  unsigned getMaxInterleaveFactor();
+  unsigned getMaxInterleaveFactor(unsigned VF);
 
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
 
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -419,7 +419,7 @@
   return Cost;
 }
 
-unsigned AArch64TTIImpl::getMaxInterleaveFactor() {
+unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   if (ST->isCortexA57())
     return 4;
   return 2;
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -96,7 +96,7 @@
     return 32;
   }
 
-  unsigned getMaxInterleaveFactor() {
+  unsigned getMaxInterleaveFactor(unsigned VF) {
     // These are out of order CPUs:
     if (ST->isCortexA15() || ST->isSwift())
       return 2;
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -81,7 +81,7 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions);
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
-  unsigned getMaxInterleaveFactor();
+  unsigned getMaxInterleaveFactor(unsigned VF);
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,7 +215,7 @@
 
 }
 
-unsigned PPCTTIImpl::getMaxInterleaveFactor() {
+unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   unsigned Directive = ST->getDarwinDirective();
   // The 440 has no SIMD support, but floating-point instructions
   // have a 5-cycle latency, so unroll by 5x for latency hiding.
Index: lib/Target/R600/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/R600/AMDGPUTargetTransformInfo.h
+++ lib/Target/R600/AMDGPUTargetTransformInfo.h
@@ -70,7 +70,7 @@
 
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
-  unsigned getMaxInterleaveFactor();
+  unsigned getMaxInterleaveFactor(unsigned VF);
 };
 
 } // end namespace llvm
Index: lib/Target/R600/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -76,7 +76,7 @@
 
 unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; }
 
-unsigned AMDGPUTTIImpl::getMaxInterleaveFactor() {
+unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // Semi-arbitrary large amount.
   return 64;
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -72,7 +72,7 @@
 
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
-  unsigned getMaxInterleaveFactor();
+  unsigned getMaxInterleaveFactor(unsigned VF);
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -66,7 +66,13 @@
 
 }
 
-unsigned X86TTIImpl::getMaxInterleaveFactor() {
+unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  // If the loop will not be vectorized, don't interleave the loop.
+  // Let the regular unroller handle it instead, which saves the overflow
+  // check and memory check cost.
+  if (VF == 1)
+    return 1;
+
   if (ST->isAtom())
     return 1;
 
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4684,7 +4684,7 @@
                                std::max(1U, (R.MaxLocalUsers - 1)));
 
   // Clamp the unroll factor ranges to reasonable factors.
-  unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
+  unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF);
 
   // Check if the user has overridden the unroll max.
   if (VF == 1) {
Index: test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
+++ test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -47,9 +47,11 @@
 ; CHECK-VECTOR: store <4 x i32>
 ; CHECK-VECTOR: ret
 ;
+; For x86, unrolling in the loop vectorizer is disabled when VF==1.
+;
 ; CHECK-SCALAR-LABEL: @bar(
 ; CHECK-SCALAR: store i32
-; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
 ; CHECK-SCALAR: ret
 define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
Index: test/Transforms/LoopVectorize/unroll.ll
===================================================================
--- test/Transforms/LoopVectorize/unroll.ll
+++ test/Transforms/LoopVectorize/unroll.ll
@@ -0,0 +1,37 @@
+; This test makes sure that the loop is not unrolled by the vectorizer when the
+; computed VF equals 1.
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; Make sure the loop body is not duplicated: exactly one gep should remain.
+; CHECK-LABEL: @foo(
+; CHECK: getelementptr
+; CHECK-NOT: getelementptr
+
+@N = common global i32 0, align 4
+@a = common global [1000 x i32] zeroinitializer, align 16
+
+define void @foo() #0 {
+entry:
+  %0 = load i32, i32* @N, align 4
+  %cmp5 = icmp sgt i32 %0, 0
+  br i1 %cmp5, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv = sext i32 %0 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.06 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw nsw i64 %i.06, 7
+  %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %mul
+  store i32 3, i32* %arrayidx, align 4
+  %inc = add nuw nsw i64 %i.06, 1
+  %cmp = icmp slt i64 %inc, %conv
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
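
Note for reviewers: below is a minimal standalone C++ sketch (not part of the patch) of how the new VF parameter is intended to be used. The names HypotheticalX86TTI and clampInterleaveCount are simplified stand-ins for the real TTI plumbing and the clamping in LoopVectorize.cpp, not LLVM API; the returned factors are illustrative only.

#include <algorithm>
#include <iostream>

// Simplified stand-in for a target's TTI implementation. Mirrors the X86
// change above: when the loop is not vectorized (VF == 1), report an
// interleave factor of 1 so the regular unroller handles the loop and the
// vectorizer's overflow/memory checks are not paid for nothing.
struct HypotheticalX86TTI {
  bool IsAtom = false;
  unsigned getMaxInterleaveFactor(unsigned VF) const {
    if (VF == 1)
      return 1; // leave scalar loops to the regular unroller
    if (IsAtom)
      return 1; // in-order core: interleaving rarely pays off
    return 4;   // illustrative value for an out-of-order core
  }
};

// Simplified stand-in for the clamping done in LoopVectorize.cpp: the
// desired interleave count is capped by the target's per-VF maximum.
unsigned clampInterleaveCount(const HypotheticalX86TTI &TTI, unsigned VF,
                              unsigned DesiredIC) {
  unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF);
  return std::min(DesiredIC, MaxInterleaveSize);
}

int main() {
  HypotheticalX86TTI TTI;
  std::cout << clampInterleaveCount(TTI, /*VF=*/1, /*DesiredIC=*/4) << "\n"; // 1
  std::cout << clampInterleaveCount(TTI, /*VF=*/4, /*DesiredIC=*/8) << "\n"; // 4
  return 0;
}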