Index: ../lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- ../lib/Transforms/Vectorize/LoopVectorize.cpp
+++ ../lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5819,6 +5819,16 @@
       return TTI.getAddressComputationCost(VectorTy) +
         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
+    if (LI && Legal->isUniform(Ptr)) {
+      // Load from a uniform pointer: cost it as scalar load + broadcast, i.e.
+      // scalar address computation + scalar memory op + SK_Broadcast shuffle.
+      unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
+      Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+                                  Alignment, AS);
+      return Cost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                       ValTy);
+    }
+
     // For an interleaved access, calculate the total cost of the whole
     // interleave group.
     if (Legal->isAccessInterleaved(I)) {
Index: ../test/Transforms/LoopVectorize/X86/uniform_load.ll
===================================================================
--- ../test/Transforms/LoopVectorize/X86/uniform_load.ll
+++ ../test/Transforms/LoopVectorize/X86/uniform_load.ll
@@ -0,0 +1,58 @@
+; RUN: opt -O2 -S -mcpu=core-avx2 < %s | FileCheck %s
+
+;float inc = 0.5;
+;void foo(float *A, unsigned N) {
+;
+;  for (unsigned i=0; i<N; i++){
+;    A[i] += inc;
+;  }
+;}
+
+; CHECK-LABEL: foo
+; CHECK: vector.body
+; CHECK: load <8 x float>
+; CHECK: fadd <8 x float>
+; CHECK: store <8 x float>
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@inc = global float 5.000000e-01, align 4
+
+; Function Attrs: nounwind uwtable
+define void @foo(float* %A, i32 %N) {
+entry:
+  %A.addr = alloca float*, align 8
+  %N.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store float* %A, float** %A.addr, align 8
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %N.addr, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load float, float* @inc, align 4
+  %3 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %3 to i64
+  %4 = load float*, float** %A.addr, align 8
+  %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
+  %5 = load float, float* %arrayidx, align 4
+  %add = fadd float %5, %2
+  store float %add, float* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %6 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %6, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}