Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -99,6 +99,11 @@
                                       unsigned Index) {
   switch (Opcode) {
   case Instruction::ExtractElement:
+  case Instruction::InsertElement:
+    // Extracts are just reads of a subregister, so are free. Inserts are
+    // considered free because we don't want to have any cost for scalarizing
+    // operations, and we don't have to copy into a different register class.
+
     // Dynamic indexing isn't free and is best avoided.
     return Index == ~0u ? 2 : 0;
   default:
Index: test/Analysis/CostModel/AMDGPU/insertelement.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/insertelement.ll
@@ -0,0 +1,39 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; Inserting an element at a constant, in-range index is expected to be free.
+
+; CHECK: 'insertelement_v2i32'
+; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
+define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
+  %insert = insertelement <2 x i32> %vec, i32 123, i32 1
+  store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'insertelement_v2i64'
+; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
+define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
+  %insert = insertelement <2 x i64> %vec, i64 123, i64 1
+  store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'insertelement_v2i16'
+; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
+define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+  %insert = insertelement <2 x i16> %vec, i16 123, i16 1
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'insertelement_v2i8'
+; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
+define void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+  %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
+  %insert = insertelement <2 x i8> %vec, i8 123, i8 1
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
+  ret void
+}