Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -130,6 +130,20 @@
     OpInfo = TargetTransformInfo::OK_NonUniformConstantValue;
     if (cast<Constant>(V)->getSplatValue() != nullptr)
       OpInfo = TargetTransformInfo::OK_UniformConstantValue;
+  } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(V)) {
+    // Check for a splat of a variable.
+    unsigned NumVecElems = V->getType()->getVectorNumElements();
+    if (isPowerOf2_32(NumVecElems)) {
+      SmallVector<int, 16> ShuffleMask(NumVecElems, 0);
+      // Check whether the shuffle mask broadcasts a single source element.
+      SmallVector<int, 16> Mask = SI->getShuffleMask();
+      for (unsigned i = 0; i < NumVecElems; i++) {
+        for (unsigned j = 0; j < NumVecElems; j++)
+          ShuffleMask[j] = i;
+        if (ShuffleMask == Mask)
+          OpInfo = TargetTransformInfo::OK_UniformValue;
+      }
+    }
   }
 
   return OpInfo;
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -223,6 +223,25 @@
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
   }
 
+  static const CostTblEntry<MVT::SimpleValueType> SSE2UniformCostTable[] = {
+    { ISD::SHL, MVT::v8i16, 1 }, // psllw
+    { ISD::SHL, MVT::v4i32, 1 }, // pslld
+    { ISD::SHL, MVT::v2i64, 1 }, // psllq
+
+    { ISD::SRL, MVT::v8i16, 1 }, // psrlw
+    { ISD::SRL, MVT::v4i32, 1 }, // psrld
+    { ISD::SRL, MVT::v2i64, 1 }, // psrlq
+
+    { ISD::SRA, MVT::v8i16, 1 }, // psraw
+    { ISD::SRA, MVT::v4i32, 1 }, // psrad
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformValue && ST->hasSSE2()) {
+    int Idx = CostTableLookup(SSE2UniformCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2UniformCostTable[Idx].Cost;
+  }
+
   if (ISD == ISD::SHL &&
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
     EVT VT = LT.second;
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4535,6 +4535,10 @@
         Op2VP = TargetTransformInfo::OP_PowerOf2;
         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
       }
+    } else if (SE->isSCEVable(Op2->getType())) {
+      const SCEV *Op2SCEV = SE->getSCEV(Op2);
+      if (SE->isLoopInvariant(Op2SCEV, TheLoop))
+        Op2VK = TargetTransformInfo::OK_UniformValue;
     }
 
     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
Index: test/Analysis/CostModel/X86/testshiftashr.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftashr.ll
+++ test/Analysis/CostModel/X86/testshiftashr.ll
@@ -529,3 +529,59 @@
   ret %shifttypec32i8 %0
 }
 
+; Uniform variable shift.
+%shifttypeu16i8 = type <16 x i8>
+define %shifttypeu16i8 @shift16i8u(%shifttypeu16i8 %a, i8 %b) {
+entry:
+  ; SSE2: shift16i8u
+  ; SSE2: cost of 160 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i8u
+  ; SSE2-CODEGEN: sarb %cl
+
+  %broadcast.splatinsert1 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %broadcast.splat2 = shufflevector <16 x i8> %broadcast.splatinsert1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp = ashr <16 x i8> %a, %broadcast.splat2
+  ret %shifttypeu16i8 %tmp
+}
+
+%shifttypeu8i16 = type <8 x i16>
+define %shifttypeu8i16 @shift8i16u(%shifttypeu8i16 %a, i16 %b) {
+entry:
+  ; SSE2: shift8i16u
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i16u
+  ; SSE2-CODEGEN: psraw
+
+  %broadcast.splatinsert1 = insertelement <8 x i16> undef, i16 %b, i32 0
+  %broadcast.splat2 = shufflevector <8 x i16> %broadcast.splatinsert1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp = ashr <8 x i16> %a, %broadcast.splat2
+  ret %shifttypeu8i16 %tmp
+}
+
+%shifttypeu4i32 = type <4 x i32>
+define %shifttypeu4i32 @shift4i32u(%shifttypeu4i32 %a, i32 %b) {
+entry:
+  ; SSE2: shift4i32u
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i32u
+  ; SSE2-CODEGEN: psrad
+
+  %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %b, i32 0
+  %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp = ashr <4 x i32> %a, %broadcast.splat2
+  ret %shifttypeu4i32 %tmp
+}
+
+%shifttypeu2i64 = type <2 x i64>
+define %shifttypeu2i64 @shift2i64u(%shifttypeu2i64 %a, i64 %b) {
+entry:
+  ; SSE2: shift2i64u
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i64u
+  ; SSE2-CODEGEN: sarq %cl
+
+  %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %b, i32 0
+  %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %tmp = ashr <2 x i64> %a, %broadcast.splat2
+  ret %shifttypeu2i64 %tmp
+}
Index: test/Analysis/CostModel/X86/testshiftlshr.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftlshr.ll
+++ test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -527,3 +527,60 @@
     i8 3, i8 3, i8 3, i8 3>
   ret %shifttypec32i8 %0
 }
+
+; Uniform variable shift.
+%shifttypeu16i8 = type <16 x i8>
+define %shifttypeu16i8 @shift16i8u(%shifttypeu16i8 %a, i8 %b) {
+entry:
+  ; SSE2: shift16i8u
+  ; SSE2: cost of 160 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i8u
+  ; SSE2-CODEGEN: shrb %cl
+
+  %broadcast.splatinsert1 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %broadcast.splat2 = shufflevector <16 x i8> %broadcast.splatinsert1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp = lshr <16 x i8> %a, %broadcast.splat2
+  ret %shifttypeu16i8 %tmp
+}
+
+%shifttypeu8i16 = type <8 x i16>
+define %shifttypeu8i16 @shift8i16u(%shifttypeu8i16 %a, i16 %b) {
+entry:
+  ; SSE2: shift8i16u
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i16u
+  ; SSE2-CODEGEN: psrlw
+
+  %broadcast.splatinsert1 = insertelement <8 x i16> undef, i16 %b, i32 0
+  %broadcast.splat2 = shufflevector <8 x i16> %broadcast.splatinsert1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp = lshr <8 x i16> %a, %broadcast.splat2
+  ret %shifttypeu8i16 %tmp
+}
+
+%shifttypeu4i32 = type <4 x i32>
+define %shifttypeu4i32 @shift4i32u(%shifttypeu4i32 %a, i32 %b) {
+entry:
+  ; SSE2: shift4i32u
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i32u
+  ; SSE2-CODEGEN: psrld
+
+  %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %b, i32 0
+  %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp = lshr <4 x i32> %a, %broadcast.splat2
+  ret %shifttypeu4i32 %tmp
+}
+
+%shifttypeu2i64 = type <2 x i64>
+define %shifttypeu2i64 @shift2i64u(%shifttypeu2i64 %a, i64 %b) {
+entry:
+  ; SSE2: shift2i64u
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i64u
+  ; SSE2-CODEGEN: psrlq
+
+  %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %b, i32 0
+  %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %tmp = lshr <2 x i64> %a, %broadcast.splat2
+  ret %shifttypeu2i64 %tmp
+}
Index: test/Analysis/CostModel/X86/testshiftshl.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftshl.ll
+++ test/Analysis/CostModel/X86/testshiftshl.ll
@@ -527,3 +527,60 @@
     i8 3, i8 3, i8 3, i8 3>
   ret %shifttypec32i8 %0
 }
+
+; Uniform variable shift.
+%shifttypeu16i8 = type <16 x i8>
+define %shifttypeu16i8 @shift16i8u(%shifttypeu16i8 %a, i8 %b) {
+entry:
+  ; SSE2: shift16i8u
+  ; SSE2: cost of 30 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i8u
+  ; SSE2-CODEGEN: psllw
+
+  %broadcast.splatinsert1 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %broadcast.splat2 = shufflevector <16 x i8> %broadcast.splatinsert1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp = shl <16 x i8> %a, %broadcast.splat2
+  ret %shifttypeu16i8 %tmp
+}
+
+%shifttypeu8i16 = type <8 x i16>
+define %shifttypeu8i16 @shift8i16u(%shifttypeu8i16 %a, i16 %b) {
+entry:
+  ; SSE2: shift8i16u
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i16u
+  ; SSE2-CODEGEN: psllw
+
+  %broadcast.splatinsert1 = insertelement <8 x i16> undef, i16 %b, i32 0
+  %broadcast.splat2 = shufflevector <8 x i16> %broadcast.splatinsert1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp = shl <8 x i16> %a, %broadcast.splat2
+  ret %shifttypeu8i16 %tmp
+}
+
+%shifttypeu4i32 = type <4 x i32>
+define %shifttypeu4i32 @shift4i32u(%shifttypeu4i32 %a, i32 %b) {
+entry:
+  ; SSE2: shift4i32u
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i32u
+  ; SSE2-CODEGEN: pslld
+
+  %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %b, i32 0
+  %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp = shl <4 x i32> %a, %broadcast.splat2
+  ret %shifttypeu4i32 %tmp
+}
+
+%shifttypeu2i64 = type <2 x i64>
+define %shifttypeu2i64 @shift2i64u(%shifttypeu2i64 %a, i64 %b) {
+entry:
+  ; SSE2: shift2i64u
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i64u
+  ; SSE2-CODEGEN: psllq
+
+  %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %b, i32 0
+  %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %tmp = shl <2 x i64> %a, %broadcast.splat2
+  ret %shifttypeu2i64 %tmp
+}
Index: test/Transforms/LoopVectorize/uniform-shift.ll
===================================================================
--- test/Transforms/LoopVectorize/uniform-shift.ll
+++ test/Transforms/LoopVectorize/uniform-shift.ll
@@ -0,0 +1,39 @@
+; PR23582
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -dce -instcombine -simplifycfg -S | llc | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@k = common global i32 0, align 4
+@A1 = common global [1024 x i32] zeroinitializer, align 16
+@B1 = common global [1024 x i32] zeroinitializer, align 16
+@C1 = common global [1024 x i32] zeroinitializer, align 16
+
+; Check that the loop vectorizer emits a uniform (splatted) vector shift.
+; CHECK-LABEL: kernel1:
+; CHECK: [[LOOP:^[a-zA-Z0-9_.]+]]:
+; CHECK: movdqa {{.*}}, [[REG:%xmm[0-7]]]
+; CHECK-NEXT: psrad {{%xmm[0-7]}}, [[REG]]
+; CHECK-NEXT: movdqa [[REG]], {{.*}}
+; CHECK-NEXT: addq $16, {{%[a-z0-9]+}}
+; CHECK-NEXT: jne [[LOOP]]
+
+define void @kernel1() {
+entry:
+  %tmp = load i32, i32* @k, align 4
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B1, i64 0, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx, align 4
+  %shr = ashr i32 %tmp1, %tmp
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A1, i64 0, i64 %indvars.iv
+  store i32 %shr, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
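
Note for reviewers (not part of the patch): the OK_UniformValue plumbing above hinges on one predicate, namely that the second shift operand is a shufflevector whose mask broadcasts a single source lane. Below is a minimal standalone C++ sketch of that predicate; the name isSplatMask and the std::vector<int> mask representation (mirroring the SmallVector<int, 16> returned by ShuffleVectorInst::getShuffleMask()) are illustrative only, not LLVM API.

#include <cstdio>
#include <vector>

// Illustrative stand-in for the CostModel.cpp check: a mask is a splat
// iff every entry selects the same valid source lane.
static bool isSplatMask(const std::vector<int> &Mask, unsigned NumElems) {
  if (Mask.empty())
    return false;
  for (int Elt : Mask)
    if (Elt != Mask[0] || Elt < 0 || Elt >= static_cast<int>(NumElems))
      return false;
  return true;
}

int main() {
  std::vector<int> Broadcast(4, 0);         // <0, 0, 0, 0>: splat of lane 0
  std::vector<int> Identity = {0, 1, 2, 3}; // selects four lanes: not a splat
  std::printf("broadcast=%d identity=%d\n", isSplatMask(Broadcast, 4),
              isSplatMask(Identity, 4));
  return 0;
}

A single linear scan like this accepts exactly the same masks as the patch's two nested loops, which instead enumerate all NumVecElems candidate splat masks and compare; both reject undef (-1) entries and both accept the broadcast masks produced by the insertelement/shufflevector idiom exercised in the tests above.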