Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -142,10 +142,15 @@
     { ISD::FDIV, MVT::v2f64, 69 }, // divpd
     { ISD::FADD, MVT::v2f64, 2 }, // addpd
     { ISD::FSUB, MVT::v2f64, 2 }, // subpd
-    // v2i64/v4i64 mul is custom lowered as a series of long
-    // multiplies(3), shifts(3) and adds(2).
-    // slm muldq version throughput is 2
-    { ISD::MUL, MVT::v2i64, 11 },
+    // v2i64/v4i64 mul is custom lowered as a series of long
+    // multiplies(3), shifts(3) and adds(2).
+    // slm muldq version throughput is 2 and addq throughput is 4,
+    // thus: 3x2 (muldq throughput) + 3x1 (shift throughput) +
+    // 2x4 (addq throughput) = 17.
+    { ISD::MUL, MVT::v2i64, 17 },
+    // slm addq/subq throughput is 4
+    { ISD::ADD, MVT::v2i64, 4 },
+    { ISD::SUB, MVT::v2i64, 4 },
   };
 
   if (ST->isSLM()) {
Index: test/Analysis/CostModel/X86/slm-arith-costs.ll
===================================================================
--- test/Analysis/CostModel/X86/slm-arith-costs.ll
+++ test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -3,6 +3,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+define <2 x i64> @slm-costs_64_vector_add(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} add <2 x i64>
+  %res = add <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @slm-costs_64_vector_sub(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} sub <2 x i64>
+  %res = sub <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
 ; 8bit mul
 define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
 entry:
@@ -13,7 +27,7 @@
 
 define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i8>
   %res = mul nsw <2 x i8> %a, %b
   ret <2 x i8> %res
 }
@@ -97,7 +111,7 @@
 
 define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i16>
   %res = mul nsw <2 x i16> %a, %b
   ret <2 x i16> %res
 }
@@ -181,7 +195,7 @@
 
 define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i32>
   %res = mul nsw <2 x i32> %a, %b
   ret <2 x i32> %res
 }
@@ -217,28 +231,28 @@
 
 define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i64>
   %res = mul nsw <2 x i64> %a, %b
   ret <2 x i64> %res
 }
 
 define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
 entry:
-; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
+; SLM: cost of 34 {{.*}} mul nsw <4 x i64>
   %res = mul nsw <4 x i64> %a, %b
   ret <4 x i64> %res
 }
 
 define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
 entry:
-; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
+; SLM: cost of 68 {{.*}} mul nsw <8 x i64>
   %res = mul nsw <8 x i64> %a, %b
   ret <8 x i64> %res
 }
 
 define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
 entry:
-; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
+; SLM: cost of 136 {{.*}} mul nsw <16 x i64>
   %res = mul nsw <16 x i64> %a, %b
   ret <16 x i64> %res
 }
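Reviewer note: the new constants can be sanity-checked against the throughput arithmetic in the updated comment. The sketch below is plain, standalone C++ (not LLVM code); it assumes only the throughput figures quoted in the comment (muldq = 2, shift = 1, addq = 4) and the cost model's usual scaling of a table entry by the number of legalized <2 x i64> parts, which is where the 34/68/136 expectations for the wider types come from.

// Standalone sanity check for the SLM v2i64 mul/add/sub cost constants.
// Throughputs are the ones quoted in the patch comment; illustration only.
#include <cassert>
#include <cstdio>

int main() {
  const int MuldqTp = 2; // slm pmuludq throughput
  const int ShiftTp = 1; // slm vector shift throughput
  const int AddqTp = 4;  // slm paddq/psubq throughput

  // v2i64 mul lowers to 3 long multiplies, 3 shifts and 2 adds:
  // 3*2 + 3*1 + 2*4 = 17, the new table entry.
  const int MulV2I64Cost = 3 * MuldqTp + 3 * ShiftTp + 2 * AddqTp;
  assert(MulV2I64Cost == 17);

  // Wider i64 vectors legalize to N <2 x i64> parts, and the cost model
  // scales the table entry by N, giving the updated CHECK values.
  for (int Parts : {1, 2, 4, 8}) // v2i64, v4i64, v8i64, v16i64
    std::printf("<%2d x i64> mul: cost %d\n", 2 * Parts, Parts * MulV2I64Cost);
  return 0;
}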
Index: test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
+++ test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
@@ -0,0 +1,48 @@
+; This test should not be vectorized on the x86/SLM arch.
+; Vectorizing the 64-bit multiply here is wrong, since it can be done
+; at a narrower bit width (note that the sources are 16-bit).
+; In addition, quad-word addq/subq have a high cost on the SLM arch,
+; so this loop regresses by about 70% when vectorized for SLM.
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1.
+  %cmp17 = icmp sgt i32 %LastIndex, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv5 = sext i16 %Scale to i64
+  %sh_prom = and i64 %conv5, 4294967295
+  %0 = sext i16 %lag to i64
+  %wide.trip.count = zext i32 %LastIndex to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %conv8 = trunc i64 %add7 to i32
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+  ret i32 %Accumulator.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i64
+  %2 = add nsw i64 %indvars.iv, %0
+  %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+  %3 = load i16, i16* %arrayidx3, align 2
+  %conv4 = sext i16 %3 to i64
+  %mul = mul nsw i64 %conv4, %conv
+  %shr = ashr i64 %mul, %sh_prom
+  %add7 = add i64 %shr, %Accumulator.018
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
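Reviewer note: to make the regression scenario concrete, below is a plausible C++ source for the IR above. This is a hypothetical reconstruction for illustration only (the function and parameter names are taken from the IR; the original workload is not part of the patch): the loads are 16-bit and only the accumulation needs 64 bits, so the scalar loop never pays for a wide multiply, while the vectorized form would incur exactly the v2i64 mul and add costs raised by this patch.

// Hypothetical C++ equivalent of @no_vec, reconstructed from the IR
// in the test above; illustration only.
#include <cstdint>

int32_t no_vec(int32_t LastIndex, const int16_t *InputData,
               int16_t lag, int16_t Scale) {
  int64_t Accumulator = 0;
  for (int64_t i = 0; i < LastIndex; ++i) {
    // A 16-bit x 16-bit product fits comfortably in 32 bits; the
    // sign-extension to 64 bits exists only for the accumulator, so a
    // <2 x i64> multiply is far more machinery than the loop needs.
    int64_t Prod = int64_t(InputData[i + lag]) * int64_t(InputData[i]);
    Accumulator += Prod >> Scale; // the ashr/add pair from the IR
  }
  return int32_t(Accumulator);
}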