Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -223,6 +223,25 @@
     return LT.first * SSE2UniformConstCostTable[Idx].Cost;
   }
 
+  static const CostTblEntry SSE2UniformCostTable[] = {
+    { ISD::SHL, MVT::v8i16, 1 }, // psllw
+    { ISD::SHL, MVT::v4i32, 1 }, // pslld
+    { ISD::SHL, MVT::v2i64, 1 }, // psllq
+
+    { ISD::SRL, MVT::v8i16, 1 }, // psrlw
+    { ISD::SRL, MVT::v4i32, 1 }, // psrld
+    { ISD::SRL, MVT::v2i64, 1 }, // psrlq
+
+    { ISD::SRA, MVT::v8i16, 1 }, // psraw
+    { ISD::SRA, MVT::v4i32, 1 }, // psrad
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformValue && ST->hasSSE2()) {
+    int Idx = CostTableLookup(SSE2UniformCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2UniformCostTable[Idx].Cost;
+  }
+
   if (ISD == ISD::SHL &&
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
     EVT VT = LT.second;
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4535,6 +4535,10 @@
       Op2VP = TargetTransformInfo::OP_PowerOf2;
       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
     }
+  } else if (SE->isSCEVable(Op2->getType())) {
+    const SCEV *Op2SCEV = SE->getSCEV(Op2);
+    if (SE->isLoopInvariant(Op2SCEV, TheLoop))
+      Op2VK = TargetTransformInfo::OK_UniformValue;
   }
 
   return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
Index: test/Transforms/LoopVectorize/X86/uniform-shift.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/uniform-shift.ll
+++ test/Transforms/LoopVectorize/X86/uniform-shift.ll
@@ -0,0 +1,39 @@
+; PR23582
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -dce -instcombine -simplifycfg -S | llc | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@k = common global i32 0, align 4
+@A1 = common global [1024 x i32] zeroinitializer, align 16
+@B1 = common global [1024 x i32] zeroinitializer, align 16
+@C1 = common global [1024 x i32] zeroinitializer, align 16
+
+; This test checks that loop vectorizer will generate uniform vshift.
+; CHECK-LABEL: kernel1:
+; CHECK: [[LOOP:^[a-zA-Z0-9_.]+]]:
+; CHECK: movdqa {{.*}}, [[REG:%xmm[0-7]]]
+; CHECK-NEXT: psrad {{%xmm[0-7]}}, [[REG]]
+; CHECK-NEXT: movdqa [[REG]], {{.*}}
+; CHECK-NEXT: addq $16, {{%[a-z0-9]+}}
+; CHECK-NEXT: jne [[LOOP]]
+
+define void @kernel1() {
+entry:
+  %tmp = load i32, i32* @k, align 4
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B1, i64 0, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx, align 4
+  %shr = ashr i32 %tmp1, %tmp
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A1, i64 0, i64 %indvars.iv
+  store i32 %shr, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}