Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -464,6 +464,11 @@
                                         MachineFunction &MF,
                                         const SIRegisterInfo &TRI,
                                         SIMachineFunctionInfo &Info) const;
+
+  /// Estimate the type-legalization cost of \p Ty, treating any type wider
+  /// than 256 bits as expensive even when it is legal.
+  std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
+                                              Type *Ty) const;
 };
 
 } // End namespace llvm
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11680,3 +11680,19 @@
   SmallPtrSet<const Value *, 16> Visited;
   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
 }
+
+std::pair<int, MVT>
+SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
+                                          Type *Ty) const {
+  auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+  auto Size = DL.getTypeSizeInBits(Ty);
+  // The widest load or store handles 8 dwords for scalar and 4 for
+  // vector ALU. Assume anything above 8 dwords (256 bits) is expensive
+  // even if legal.
+  if (Size <= 256)
+    return Cost;
+
+  // Replace the generic cost with the number of 256-bit chunks, rounded up.
+  Cost.first = (Size + 255) / 256;
+  return Cost;
+}
Index: llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
===================================================================
--- llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -90,7 +90,7 @@
 }
 
 ; ALL: 'add_v16i64'
-; ALL: estimated cost of 32 for {{.*}} add <16 x i64>
+; ALL: estimated cost of 128 for {{.*}} add <16 x i64>
 define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
   %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
   %add = add <16 x i64> %vec, %b
Index: llvm/test/Analysis/CostModel/AMDGPU/mul.ll
===================================================================
--- llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -90,7 +90,7 @@
 
 
 ; ALL: 'mul_v8i64'
-; ALL: estimated cost of 128 for {{.*}} mul <8 x i64>
+; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b