Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -75,6 +75,9 @@
   }
 
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+
   bool isSourceOfDivergence(const Value *V) const;
 };
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -156,6 +156,108 @@
   }
 }
 
+int AMDGPUTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Align, unsigned AS) {
+  // TODO: We should not use the default accounting for the scalarization of
+  // illegal vector types when they can be successfully merged into fewer
+  // loads.
+
+  // FIXME: The base implementation should probably account for
+  // allowsMisalignedMemoryAccess, but unaligned accesses are expanded in a
+  // variety of different ways.
+
+  const unsigned SMRDOpCost = 2;
+  const unsigned BufferOpCost = 5;
+
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(Src);
+
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::FLAT_ADDRESS:
+  default: {
+    // TODO: Account for alignment restrictions.
+
+    if (VectorType *VT = dyn_cast<VectorType>(Src)) {
+      unsigned NElts = VT->getNumElements();
+      Type *EltTy = VT->getElementType();
+      unsigned EltSize = DL.getTypeAllocSize(EltTy);
+
+      // v8i32 and v16i32 vectors are legal, but the largest store is 16
+      // bytes, so ignore the default cost derived from whether the type is
+      // legal and assume the vector is split correctly.
+      if (EltSize == 4) {
+        unsigned RoundedNElts = (NElts + 3) / 4;
+        return BufferOpCost * RoundedNElts;
+      }
+    }
+
+    int BaseCost = BaseT::getMemoryOpCost(Opcode, Src, Align, AS);
+    return BufferOpCost * BaseCost;
+  }
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS: {
+    // LDS is pretty fast assuming no bank conflicts.
+    const unsigned DSOpCost = 3;
+
+    // These don't have the larger load/store sizes, so estimate how the
+    // access will be broken up.
+    VectorType *VT = dyn_cast<VectorType>(Src);
+
+    unsigned Size = DL.getTypeAllocSize(Src);
+    // This only has 32-bit and 64-bit loads and stores available even though
+    // larger vector types are legal, so estimate how many operations this
+    // will be split into. Ignore the base vector legalization cost.
+    if (Align == 1)
+      return DSOpCost * Size;
+
+    int BaseCost = BaseT::getMemoryOpCost(Opcode, Src, Align, AS);
+
+    // Somewhat hacky way to test for scalarization.
+    if (BaseCost == 1 && Align == 2)
+      return DSOpCost * Size / 2;
+
+    if (VT) {
+      unsigned NElts = VT->getNumElements();
+      Type *EltTy = VT->getElementType();
+      unsigned EltSize = DL.getTypeAllocSize(EltTy);
+
+      if (EltSize == 4) {
+        unsigned RoundedNElts = (NElts + 1) / 2;
+        return DSOpCost * RoundedNElts;
+      }
+
+      if (EltSize == 8)
+        return DSOpCost * NElts;
+
+      if (EltSize < 4)
+        return BaseCost * DSOpCost;
+    }
+
+    assert(Align >= 4);
+    return BaseCost * DSOpCost;
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    int BaseCost = BaseT::getMemoryOpCost(Opcode, Src, Align, AS);
+
+    // SMRD requires 4-byte alignment, otherwise we must use buffer
+    // instructions.
+
+    // FIXME: We should be able to handle >= 4 byte aligned sub-dword types.
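+    // For example (see test/Analysis/CostModel/AMDGPU/memory-ops.ll), a
+    // naturally aligned uniform i32 load is estimated at SMRDOpCost * 1 = 2,
+    // while the same load with only 2-byte alignment takes the buffer path
+    // below and is estimated at BufferOpCost * 1 = 5.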
+    if (Align < 4 || DL.getTypeAllocSize(Src) < 4)
+      return BufferOpCost * BaseCost;
+
+    // FIXME: Scalarized illegal types not correctly handled.
+
+    // If uniformly accessed, SMRD instructions are faster than buffer/flat
+    // instructions.
+    return SMRDOpCost * BaseCost;
+  }
+  }
+
+  llvm_unreachable("cannot happen");
+}
+
 static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
                                           const IntrinsicInst *I) {
   switch (I->getIntrinsicID()) {
Index: test/Analysis/CostModel/AMDGPU/memory-ops.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/memory-ops.ll
@@ -0,0 +1,459 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; CHECK: 'store_global_i32'
+; CHECK: estimated cost of 5 for {{.*}} store i32
+define void @store_global_i32(i32 addrspace(1)* %out) #0 {
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v2i32'
+; CHECK: estimated cost of 5 for {{.*}} store <2 x i32>
+define void @store_global_v2i32(<2 x i32> addrspace(1)* %out) #0 {
+  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v3i32'
+; CHECK: estimated cost of 5 for {{.*}} store <3 x i32>
+define void @store_global_v3i32(<3 x i32> addrspace(1)* %out) #0 {
+  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v4i32'
+; CHECK: estimated cost of 5 for {{.*}} store <4 x i32>
+define void @store_global_v4i32(<4 x i32> addrspace(1)* %out) #0 {
+  store <4 x i32> zeroinitializer, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v8i32'
+; CHECK: estimated cost of 10 for {{.*}} store <8 x i32>
+define void @store_global_v8i32(<8 x i32> addrspace(1)* %out) #0 {
+  store <8 x i32> zeroinitializer, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v16i32'
+; CHECK: estimated cost of 20 for {{.*}} store <16 x i32>
+define void @store_global_v16i32(<16 x i32> addrspace(1)* %out) #0 {
+  store <16 x i32> zeroinitializer, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v32i32'
+; CHECK: estimated cost of 40 for {{.*}} store <32 x i32>
+define void @store_global_v32i32(<32 x i32> addrspace(1)* %out) #0 {
+  store <32 x i32> zeroinitializer, <32 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v64i32'
+; CHECK: estimated cost of 80 for {{.*}} store <64 x i32>
+define void @store_global_v64i32(<64 x i32> addrspace(1)* %out) #0 {
+  store <64 x i32> zeroinitializer, <64 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_i8'
+; CHECK: estimated cost of 5 for {{.*}} store i8
+define void @store_global_i8(i8 addrspace(1)* %out) #0 {
+  store i8 0, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v4i8'
+; CHECK: estimated cost of 20 for {{.*}} store <4 x i8>
+define void @store_global_v4i8(<4 x i8> addrspace(1)* %out) #0 {
+  store <4 x i8> zeroinitializer, <4 x i8> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_i16'
+; CHECK: estimated cost of 5 for {{.*}} store i16
+define void @store_global_i16(i16 addrspace(1)* %out) #0 {
+  store i16 0, i16 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v4i16'
+; CHECK: estimated cost of 20 for {{.*}} store <4 x i16>
+define void @store_global_v4i16(<4 x i16> addrspace(1)* %out) #0 {
+  store <4 x i16> zeroinitializer, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v8i16'
+; CHECK: estimated cost of 40 for {{.*}} store <8 x i16>
+define void @store_global_v8i16(<8 x i16> addrspace(1)* %out) #0 { + store <8 x i16> zeroinitializer, <8 x i16> addrspace(1)* %out + ret void +} + +; CHECK: 'store_private_i32' +; CHECK: estimated cost of 5 for {{.*}} store i32 +define void @store_private_i32(i32* %out) #0 { + store i32 0, i32* %out + ret void +} + +; CHECK: 'store_private_v2i32' +; CHECK: estimated cost of 5 for {{.*}} store <2 x i32> +define void @store_private_v2i32(<2 x i32>* %out) #0 { + store <2 x i32> zeroinitializer, <2 x i32>* %out + ret void +} + +; CHECK: 'store_private_v3i32' +; CHECK: estimated cost of 5 for {{.*}} store <3 x i32> +define void @store_private_v3i32(<3 x i32>* %out) #0 { + store <3 x i32> zeroinitializer, <3 x i32>* %out + ret void +} + +; CHECK: 'store_private_v4i32' +; CHECK: estimated cost of 5 for {{.*}} store <4 x i32> +define void @store_private_v4i32(<4 x i32>* %out) #0 { + store <4 x i32> zeroinitializer, <4 x i32>* %out + ret void +} + +; CHECK: 'store_private_v4i8' +; CHECK: estimated cost of 20 for {{.*}} store <4 x i8> +define void @store_private_v4i8(<4 x i8>* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8>* %out + ret void +} + +; CHECK: 'store_global_i64' +; CHECK: estimated cost of 5 for {{.*}} store i64 +define void @store_global_i64(i64 addrspace(1)* %out) #0 { + store i64 0, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_i64_align_1' +; CHECK: estimated cost of 5 for {{.*}} store i64 +define void @store_global_i64_align_1(i64 addrspace(1)* %out) #0 { + store i64 0, i64 addrspace(1)* %out, align 1 + ret void +} + +; CHECK: 'store_global_v2i64' +; CHECK: estimated cost of 5 for {{.*}} store <2 x i64> +define void @store_global_v2i64(<2 x i64> addrspace(1)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_v2i64_align_1' +; CHECK: estimated cost of 5 for {{.*}} store <2 x i64> +define void @store_global_v2i64_align_1(<2 x i64> addrspace(1)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(1)* %out, align 1 + ret void +} + +; CHECK: 'store_global_v3i64' +; CHECK: estimated cost of 10 for {{.*}} store <3 x i64> +define void @store_global_v3i64(<3 x i64> addrspace(1)* %out) #0 { + store <3 x i64> zeroinitializer, <3 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_v4i64' +; CHECK: estimated cost of 10 for {{.*}} store <4 x i64> +define void @store_global_v4i64(<4 x i64> addrspace(1)* %out) #0 { + store <4 x i64> zeroinitializer, <4 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_v8i64' +; CHECK: estimated cost of 20 for {{.*}} store <8 x i64> +define void @store_global_v8i64(<8 x i64> addrspace(1)* %out) #0 { + store <8 x i64> zeroinitializer, <8 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_local_i32' +; CHECK: estimated cost of 3 for {{.*}} store i32 +define void @store_local_i32(i32 addrspace(3)* %out) #0 { + store i32 0, i32 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i32_align_1' +; CHECK: estimated cost of 12 for {{.*}} store i32 +define void @store_local_i32_align_1(i32 addrspace(3)* %out) #0 { + store i32 0, i32 addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_i32_align_2' +; CHECK: estimated cost of 6 for {{.*}} store i32 +define void @store_local_i32_align_2(i32 addrspace(3)* %out) #0 { + store i32 0, i32 addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_v2i32' +; CHECK: estimated cost of 3 for {{.*}} store <2 x i32> +define void @store_local_v2i32(<2 x i32> addrspace(3)* 
%out) #0 { + store <2 x i32> zeroinitializer, <2 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v3i32' +; CHECK: estimated cost of 6 for {{.*}} store <3 x i32> +define void @store_local_v3i32(<3 x i32> addrspace(3)* %out) #0 { + store <3 x i32> zeroinitializer, <3 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i32' +; CHECK: estimated cost of 6 for {{.*}} store <4 x i32> +define void @store_local_v4i32(<4 x i32> addrspace(3)* %out) #0 { + store <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v8i32' +; CHECK: estimated cost of 12 for {{.*}} store <8 x i32> +define void @store_local_v8i32(<8 x i32> addrspace(3)* %out) #0 { + store <8 x i32> zeroinitializer, <8 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v16i32' +; CHECK: estimated cost of 24 for {{.*}} store <16 x i32> +define void @store_local_v16i32(<16 x i32> addrspace(3)* %out) #0 { + store <16 x i32> zeroinitializer, <16 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v32i32' +; CHECK: estimated cost of 48 for {{.*}} store <32 x i32> +define void @store_local_v32i32(<32 x i32> addrspace(3)* %out) #0 { + store <32 x i32> zeroinitializer, <32 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i8' +; CHECK: estimated cost of 3 for {{.*}} store i8 +define void @store_local_i8(i8 addrspace(3)* %out) #0 { + store i8 0, i8 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i8' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i8> +define void @store_local_v4i8(<4 x i8> addrspace(3)* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i8_align_1' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i8> +define void @store_local_v4i8_align_1(<4 x i8> addrspace(3)* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8> addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_v4i8_align_2' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i8> +define void @store_local_v4i8_align_2(<4 x i8> addrspace(3)* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8> addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_i16' +; CHECK: estimated cost of 3 for {{.*}} store i16 +define void @store_local_i16(i16 addrspace(3)* %out) #0 { + store i16 0, i16 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i16_align_4' +; CHECK: estimated cost of 3 for {{.*}} store i16 +define void @store_local_i16_align_4(i16 addrspace(3)* %out) #0 { + store i16 0, i16 addrspace(3)* %out, align 4 + ret void +} + +; CHECK: 'store_local_v4i16' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i16> +define void @store_local_v4i16(<4 x i16> addrspace(3)* %out) #0 { + store <4 x i16> zeroinitializer, <4 x i16> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v8i16' +; CHECK: estimated cost of 24 for {{.*}} store <8 x i16> +define void @store_local_v8i16(<8 x i16> addrspace(3)* %out) #0 { + store <8 x i16> zeroinitializer, <8 x i16> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i64' +; CHECK: estimated cost of 3 for {{.*}} store i64 +define void @store_local_i64(i64 addrspace(3)* %out) #0 { + store i64 0, i64 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i64_align_1' +; CHECK: estimated cost of 24 for {{.*}} store i64 +define void @store_local_i64_align_1(i64 addrspace(3)* %out) #0 { + store i64 0, i64 addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_i64_align_2' +; CHECK: estimated cost of 12 
for {{.*}} store i64 +define void @store_local_i64_align_2(i64 addrspace(3)* %out) #0 { + store i64 0, i64 addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_v2i64' +; CHECK: estimated cost of 6 for {{.*}} store <2 x i64> +define void @store_local_v2i64(<2 x i64> addrspace(3)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v2i64_align_1' +; CHECK: estimated cost of 48 for {{.*}} store <2 x i64> +define void @store_local_v2i64_align_1(<2 x i64> addrspace(3)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_v2i64_align_2' +; CHECK: estimated cost of 24 for {{.*}} store <2 x i64> +define void @store_local_v2i64_align_2(<2 x i64> addrspace(3)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_v3i64' +; CHECK: estimated cost of 9 for {{.*}} store <3 x i64> +define void @store_local_v3i64(<3 x i64> addrspace(3)* %out) #0 { + store <3 x i64> zeroinitializer, <3 x i64> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i64' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i64> +define void @store_local_v4i64(<4 x i64> addrspace(3)* %out) #0 { + store <4 x i64> zeroinitializer, <4 x i64> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v8i64' +; CHECK: estimated cost of 24 for {{.*}} store <8 x i64> +define void @store_local_v8i64(<8 x i64> addrspace(3)* %out) #0 { + store <8 x i64> zeroinitializer, <8 x i64> addrspace(3)* %out + ret void +} + + +; CHECK: 'load_constant_i32' +; CHECK: estimated cost of 2 for {{.*}} load i32 +define void @load_constant_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { + %val = load i32, i32 addrspace(2)* %in + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i32_align_2' +; CHECK: estimated cost of 5 for {{.*}} load i32 +define void @load_constant_i32_align_2(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { + %val = load i32, i32 addrspace(2)* %in, align 2 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i32_align_1' +; CHECK: estimated cost of 5 for {{.*}} load i32 +define void @load_constant_i32_align_1(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { + %val = load i32, i32 addrspace(2)* %in, align 1 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i8' +; CHECK: estimated cost of 5 for {{.*}} load i8 +define void @load_constant_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { + %val = load i8, i8 addrspace(2)* %in + store i8 %val, i8 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i8_align_4' +; CHECK: estimated cost of 5 for {{.*}} load i8 +define void @load_constant_i8_align_4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { + %val = load i8, i8 addrspace(2)* %in, align 4 + store i8 %val, i8 addrspace(1)* %out + ret void +} + +; FIXME: This currently is actually using buffer instructions on the scalarized vector. 
+ +; CHECK: 'load_constant_v4i8' +; CHECK: estimated cost of 8 for {{.*}} load <4 x i8> +define void @load_constant_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { + %val = load <4 x i8>, <4 x i8> addrspace(2)* %in + store <4 x i8> %val, <4 x i8> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v4i32' +; CHECK: estimated cost of 2 for {{.*}} load <4 x i32> +define void @load_constant_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { + %val = load <4 x i32>, <4 x i32> addrspace(2)* %in + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v4i32_align_4' +; CHECK: estimated cost of 2 for {{.*}} load <4 x i32> +define void @load_constant_v4i32_align_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { + %val = load <4 x i32>, <4 x i32> addrspace(2)* %in, align 4 + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v4i32_align_1' +; CHECK: estimated cost of 5 for {{.*}} load <4 x i32> +define void @load_constant_v4i32_align_1(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { + %val = load <4 x i32>, <4 x i32> addrspace(2)* %in, align 1 + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v8i32' +; CHECK: estimated cost of 2 for {{.*}} load <8 x i32> +define void @load_constant_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { + %val = load <8 x i32>, <8 x i32> addrspace(2)* %in + store <8 x i32> %val, <8 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v16i32' +; CHECK: estimated cost of 2 for {{.*}} load <16 x i32> +define void @load_constant_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { + %val = load <16 x i32>, <16 x i32> addrspace(2)* %in + store <16 x i32> %val, <16 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v32i32' +; CHECK: estimated cost of 4 for {{.*}} load <32 x i32> +define void @load_constant_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { + %val = load <32 x i32>, <32 x i32> addrspace(2)* %in + store <32 x i32> %val, <32 x i32> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind }
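For reference, a minimal sketch (not part of this patch) of how a mid-end pass would reach the hook added above through the TargetTransformInfo facade; the helper name estimateStoreCost is hypothetical and assumes the getMemoryOpCost signature used by this branch (unsigned alignment and address space):

// Illustrative sketch only; not part of the patch.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static int estimateStoreCost(const TargetTransformInfo &TTI,
                             const StoreInst &SI) {
  Type *Ty = SI.getValueOperand()->getType();
  // Dispatches to AMDGPUTTIImpl::getMemoryOpCost when targeting amdgcn; the
  // address space selects the SMRD, buffer, or DS cost modeled above.
  return TTI.getMemoryOpCost(Instruction::Store, Ty, SI.getAlignment(),
                             SI.getPointerAddressSpace());
}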