Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -56,6 +56,13 @@
   // calls are particularly expensive in NVPTX.
   unsigned getInliningThresholdMultiplier() { return 5; }
 
+  unsigned getNumberOfRegisters(bool /*Vector*/) const { return 1; }
+  unsigned getRegisterBitWidth(bool Vector) const { return Vector ? 128 : 64; }
+
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+  int getVectorInstrCost(unsigned Opcode, Type *Ty, unsigned Index);
+
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -118,6 +118,42 @@
   }
 }
 
+int NVPTXTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                  unsigned Alignment, unsigned AddressSpace) {
+  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+  // Model vector loads and stores (of vector types that PTX supports) as half
+  // the cost of the corresponding set of scalar loads and stores. This is a
+  // bit optimistic, but it encourages the SLP vectorizer to emit vectorized
+  // loads and stores, which we want.
+  //
+  // We ignore the Alignment arg, even though PTX can only handle vector
+  // loads/stores that are aligned to the vector's width, because the SLP
+  // vectorizer queries us with an alignment of 1.
+  if (Src->isVectorTy()) {
+    int N = Src->getVectorNumElements();
+    int SZ = Src->getScalarSizeInBits();
+    if ((SZ <= 64 && N == 2) || (SZ <= 32 && N == 4)) {
+      return Cost / 2;
+    }
+  }
+  return Cost;
+}
+
+int NVPTXTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Ty,
+                                     unsigned Index) {
+  switch (Opcode) {
+  case Instruction::InsertElement:
+    // Model vector insertions as free. PTX only supports vector loads and
+    // stores, and those take a list of general-purpose registers, e.g.
+    // {a, b, c, d}, so vector insertions get optimized away when we lower
+    // to PTX.
+    return 0;
+  default:
+    return BaseT::getVectorInstrCost(Opcode, Ty, Index);
+  }
+}
+
 void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
                                            TTI::UnrollingPreferences &UP) {
   BaseT::getUnrollingPreferences(L, UP);
Index: test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
+
Index: test/Transforms/SLPVectorizer/NVPTX/simple.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/NVPTX/simple.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+; Check that the four scalar stores below get vectorized into a single <4 x float> store.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+declare float @llvm.nvvm.ex2.approx.ftz.f(float)
+declare float @llvm.nvvm.lg2.approx.ftz.f(float)
+declare <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>*, i32)
+
+define weak_odr void @foo(<4 x float>* %in1, <4 x float>* %in2, float* %out, i64 %out_idx) {
+  %1 = tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* %in1, i32 16)
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = extractelement <4 x float> %1, i32 1
+  %4 = extractelement <4 x float> %1, i32 2
+  %5 = extractelement <4 x float> %1, i32 3
+  %6 = fmul float %3, 0x3FF7154760000000
+  %7 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
+  %8 = fmul float %4, 0x3FF7154760000000
+  %9 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8)
+  %10 = fmul float %5, 0x3FF7154760000000
+  %11 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %10)
+  %12 = fmul float %2, 0x3FF7154760000000
+  %13 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %12)
+  %14 = tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* %in2, i32 16)
+  %15 = extractelement <4 x float> %14, i32 0
+  %16 = extractelement <4 x float> %14, i32 1
+  %17 = extractelement <4 x float> %14, i32 2
+  %18 = extractelement <4 x float> %14, i32 3
+  %19 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %16)
+  %20 = fmul float %19, 0x3FE62E4300000000
+  %21 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %17)
+  %22 = fmul float %21, 0x3FE62E4300000000
+  %23 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %18)
+  %24 = fmul float %23, 0x3FE62E4300000000
+  %25 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %15)
+  %26 = fmul float %25, 0x3FE62E4300000000
+  %27 = fadd float %7, %20
+  %28 = fadd float %9, %22
+  %29 = fadd float %11, %24
+  %30 = fadd float %13, %26
+  %31 = getelementptr inbounds float, float* %out, i64 %out_idx
+  store float %27, float* %31, align 16
+  %32 = getelementptr inbounds float, float* %31, i64 1
+  store float %28, float* %32, align 4
+  %33 = getelementptr inbounds float, float* %31, i64 2
+  store float %29, float* %33, align 8
+  %34 = getelementptr inbounds float, float* %31, i64 3
+  ; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 16
+  store float %30, float* %34, align 4
+  ret void
+}
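For reference, a reduced sketch of what these cost changes are meant to enable (the function @bar and the value names below are illustrative only and not part of the patch): with insertelement modeled as free and a <4 x float> store modeled at half the cost of four scalar stores, the SLP vectorizer should find it profitable to rewrite four adjacent float stores into a single vector store, which is what the CHECK line in simple.ll verifies for @foo.

; Before SLP vectorization: four adjacent scalar stores (hypothetical @bar).
define void @bar(float* %out, float %a, float %b, float %c, float %d) {
  store float %a, float* %out, align 16
  %p1 = getelementptr inbounds float, float* %out, i64 1
  store float %b, float* %p1, align 4
  %p2 = getelementptr inbounds float, float* %out, i64 2
  store float %c, float* %p2, align 8
  %p3 = getelementptr inbounds float, float* %out, i64 3
  store float %d, float* %p3, align 4
  ret void
}

; After SLP vectorization, the body becomes roughly the following. The four
; insertelements are costed at 0 by getVectorInstrCost, and the <4 x float>
; store at half the cost of the four scalar stores by getMemoryOpCost, so the
; vectorizer considers the rewrite profitable.
;   %v0 = insertelement <4 x float> undef, float %a, i32 0
;   %v1 = insertelement <4 x float> %v0, float %b, i32 1
;   %v2 = insertelement <4 x float> %v1, float %c, i32 2
;   %v3 = insertelement <4 x float> %v2, float %d, i32 3
;   %vp = bitcast float* %out to <4 x float>*
;   store <4 x float> %v3, <4 x float>* %vp, align 16
;   ret void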