diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -464,6 +464,16 @@
                                       MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) const;
+
+  /// Return the pair produced by
+  /// TargetLoweringBase::getTypeLegalizationCost for \p Ty, except that for
+  /// types wider than 256 bits the first member (the cost) is replaced by the
+  /// number of 256-bit chunks needed to cover the type, rounded up.
+  ///
+  /// NOTE(review): this shadows the base-class member of the same name;
+  /// confirm whether the base declaration is virtual and add `override` if so.
+  std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
+                                              Type *Ty) const;
 };
 
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11690,3 +11690,20 @@
   SmallPtrSet<const Value *, 16> Visited;
   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
 }
+
+std::pair<int, MVT>
+SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
+                                          Type *Ty) const {
+  auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+  auto Size = DL.getTypeSizeInBits(Ty);
+  // Maximum load or store can handle 8 dwords for scalar and 4 for
+  // vector ALU. Let's assume anything above 8 dwords is expensive
+  // even if legal.
+  if (Size <= 256)
+    return Cost;
+
+  // Replace the generic cost with the number of 256-bit (8-dword) chunks
+  // needed to cover the type, rounding up.
+  Cost.first = (Size + 255) / 256;
+  return Cost;
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -90,7 +90,7 @@
 }
 
 ; ALL: 'add_v16i64'
-; ALL: estimated cost of 32 for {{.*}} add <16 x i64>
+; ALL: estimated cost of 128 for {{.*}} add <16 x i64>
 define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
   %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
   %add = add <16 x i64> %vec, %b
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -90,7 +90,7 @@
 
 
 ; ALL: 'mul_v8i64'
-; ALL: estimated cost of 128 for {{.*}} mul <8 x i64>
+; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b