Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1028,9 +1028,10 @@
   if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
     MemVT = MemVT.getScalarType();
 
-  if (MemVT.isExtended()) {
-    // This should really only happen if we have vec3 arguments
-    assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+  // Round up vec3/vec5 arguments.
+  if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
+    assert(MemVT.getVectorNumElements() == 3 ||
+           MemVT.getVectorNumElements() == 5);
     MemVT = MemVT.getPow2VectorType(State.getContext());
   }
 
Index: test/Analysis/CostModel/AMDGPU/add-sub.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -20,7 +20,9 @@
 }
 
 ; CHECK: 'add_v3i32'
-; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
+; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; CHECK: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
 define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %add = add <3 x i32> %vec, %b
@@ -37,6 +39,17 @@
   ret void
 }
 
+; CHECK: 'add_v5i32'
+; Allow for 8 when v5i32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; CHECK: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
+define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
+  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
+  %add = add <5 x i32> %vec, %b
+  store <5 x i32> %add, <5 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: 'add_i64'
 ; CHECK: estimated cost of 2 for {{.*}} add i64
 define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
Index: test/Analysis/CostModel/AMDGPU/extractelement.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -38,6 +38,15 @@
   ret void
 }
 
+; GCN: 'extractelement_v5i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32>
+define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) {
+  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
+  %elt = extractelement <5 x i32> %vec, i32 1
+  store i32 %elt, i32 addrspace(1)* %out
+  ret void
+}
+
 ; GCN: 'extractelement_v8i32'
 ; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
 define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
Index: test/Analysis/CostModel/AMDGPU/fabs.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fabs.ll
+++ test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -27,6 +27,15 @@
   ret void
 }
 
+; CHECK: 'fabs_v5f32'
+; CHECK: estimated cost of 0 for {{.*}} call <5 x float> @llvm.fabs.v5f32
+define amdgpu_kernel void @fabs_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %fabs = call <5 x float> @llvm.fabs.v5f32(<5 x float> %vec) #1
+  store <5 x float> %fabs, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
 define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
@@ -84,6 +93,7 @@
 declare float @llvm.fabs.f32(float) #1
 declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #1
 declare <3 x float> @llvm.fabs.v3f32(<3 x float>) #1
+declare <5 x float> @llvm.fabs.v5f32(<5 x float>) #1
 
 declare double @llvm.fabs.f64(double) #1
 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1
Index: test/Analysis/CostModel/AMDGPU/fadd.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fadd.ll
+++ test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -20,7 +20,9 @@
 }
 
 ; ALL: 'fadd_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
 define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fadd <3 x float> %vec, %b
@@ -28,6 +30,17 @@
   ret void
 }
 
+; ALL: 'fadd_v5f32'
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
+define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fadd <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
Index: test/Analysis/CostModel/AMDGPU/fdiv.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -26,8 +26,10 @@
 }
 
 ; ALL: 'fdiv_v3f32'
-; NOFP32DENORM: estimated cost of 36 for {{.*}} fdiv <3 x float>
-; FP32DENORMS: estimated cost of 30 for {{.*}} fdiv <3 x float>
+; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 36/30 when it is legal.
+; NOFP32DENORM: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
+; FP32DENORMS: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
 define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fdiv <3 x float> %vec, %b
@@ -35,6 +37,18 @@
   ret void
 }
 
+; ALL: 'fdiv_v5f32'
+; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 60/50 when it is legal.
+; NOFP32DENORM: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
+; FP32DENORMS: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
+define amdgpu_kernel void @fdiv_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fdiv <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fdiv_f64'
 ; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
Index: test/Analysis/CostModel/AMDGPU/fmul.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fmul.ll
+++ test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -20,7 +20,9 @@
 }
 
 ; ALL: 'fmul_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
 define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fmul <3 x float> %vec, %b
@@ -28,6 +30,17 @@
   ret void
 }
 
+; ALL: 'fmul_v5f32'
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
+define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fmul <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
Index: test/Analysis/CostModel/AMDGPU/fsub.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fsub.ll
+++ test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -20,7 +20,9 @@
 }
 
 ; ALL: 'fsub_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float>
 define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fsub <3 x float> %vec, %b
@@ -28,6 +30,17 @@
   ret void
 }
 
+; ALL: 'fsub_v5f32'
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float>
+define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fsub <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
 ; SLOWF64: estimated cost of 3 for {{.*}} fsub double
Index: test/Analysis/CostModel/AMDGPU/mul.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/mul.ll
+++ test/Analysis/CostModel/AMDGPU/mul.ll
@@ -19,7 +19,9 @@
 }
 
 ; CHECK: 'mul_v3i32'
-; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
+; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
+; and 9 when it is legal.
+; CHECK: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
 define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
@@ -27,6 +29,17 @@
   ret void
 }
 
+; CHECK: 'mul_v5i32'
+; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
+; and 15 when it is legal.
+; CHECK: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
+define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
+  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
+  %mul = mul <5 x i32> %vec, %b
+  store <5 x i32> %mul, <5 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: 'mul_v4i32'
 ; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
 define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
Index: test/CodeGen/AMDGPU/kernel-args.ll
===================================================================
--- test/CodeGen/AMDGPU/kernel-args.ll
+++ test/CodeGen/AMDGPU/kernel-args.ll
@@ -375,6 +375,122 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v5i8_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 16
+; HSA-GFX9: kernarg_segment_alignment = 4
+
+; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
+; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
+; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
+
+; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+
+; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
+define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
+entry:
+  store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i16_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 32
+; HSA-GFX9: kernarg_segment_alignment = 4
+
+; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
+; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
+; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
+
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+
+; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
+; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind {
+entry:
+  store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i32_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 64
+; HSA-GFX9: kernarg_segment_alignment = 5
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
+define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
+entry:
+  store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5f32_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 64
+; HSA-GFX9: kernarg_segment_alignment = 5
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
+define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
+entry:
+  store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i64_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 128
+; HSA-GFX9: kernarg_segment_alignment = 6
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
+; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
+; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
+entry:
+  store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5f64_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 128
+; HSA-GFX9: kernarg_segment_alignment = 6
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
+; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
+; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
+entry:
+  store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
+  ret void
+}
+
 ; FIXME: Lots of unpack and re-pack junk on VI
 ; FUNC-LABEL: {{^}}v8i8_arg:
 ; HSA-GFX9: kernarg_segment_byte_size = 16
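
Note (not part of the patch): the AMDGPUISelLowering.cpp hunk is the heart of the change. Kernel argument types with a non-power-of-2 element count (vec3 and vec5) are rounded up to the next power-of-2 vector type before the argument is lowered, which is why the cost-model tests above accept either the exact count (3 or 5 ops) or the widened count (4 or 8 ops). A minimal standalone sketch of that rounding, using LLVM's EVT API, is below; it assumes LLVM headers and libraries are available and is an illustration, not code from this patch:

  // Illustrative sketch: round a non-pow2 vector type up, as the patched
  // kernel-argument analysis does.
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    // A v5i32 kernel argument: five 32-bit lanes, element count not a power of 2.
    EVT MemVT = EVT::getVectorVT(Ctx, MVT::i32, 5);
    if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
      // Rounds the element count up to the next power of 2: v5i32 -> v8i32.
      MemVT = MemVT.getPow2VectorType(Ctx);
    }
    outs() << MemVT.getEVTString() << '\n'; // prints "v8i32"
    return 0;
  }

The same rounding accounts for the kernarg sizes checked in kernel-args.ll: a v5i32 argument (20 bytes) occupies a 32-byte, 32-byte-aligned v8i32 slot, so an 8-byte out pointer plus the aligned argument yields kernarg_segment_byte_size = 64, and kernarg_segment_alignment = 5 is the log2 of that 32-byte alignment.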