Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1008,9 +1008,10 @@
   if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
     MemVT = MemVT.getScalarType();
 
-  if (MemVT.isExtended()) {
-    // This should really only happen if we have vec3 arguments
-    assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+  // Round up vec3/vec5 argument.
+  if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
+    assert(MemVT.getVectorNumElements() == 3 ||
+           MemVT.getVectorNumElements() == 5);
     MemVT = MemVT.getPow2VectorType(State.getContext());
   }
 
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -20,7 +20,9 @@
 }
 
 ; CHECK: 'add_v3i32'
-; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
+; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; CHECK: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
 define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %add = add <3 x i32> %vec, %b
@@ -37,6 +39,17 @@
   ret void
 }
 
+; CHECK: 'add_v5i32'
+; Allow for 8 when v5i32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; CHECK: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
+define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
+  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
+  %add = add <5 x i32> %vec, %b
+  store <5 x i32> %add, <5 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: 'add_i64'
 ; CHECK: estimated cost of 2 for {{.*}} add i64
 define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -38,6 +38,15 @@
   ret void
 }
 
+; GCN: 'extractelement_v5i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32>
+define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) {
+  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
+  %elt = extractelement <5 x i32> %vec, i32 1
+  store i32 %elt, i32 addrspace(1)* %out
+  ret void
+}
+
 ; GCN: 'extractelement_v8i32'
 ; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
 define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -27,6 +27,15 @@
   ret void
 }
 
+; CHECK: 'fabs_v5f32'
+; CHECK: estimated cost of 0 for {{.*}} call <5 x float> @llvm.fabs.v5f32
+define amdgpu_kernel void @fabs_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %fabs = call <5 x float> @llvm.fabs.v5f32(<5 x float> %vec) #1
+  store <5 x float> %fabs, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
 define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
@@ -84,6 +93,7 @@
 declare float @llvm.fabs.f32(float) #1
 declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #1
 declare <3 x float> @llvm.fabs.v3f32(<3 x float>) #1
+declare <5 x float> @llvm.fabs.v5f32(<5 x float>) #1
 
 declare double @llvm.fabs.f64(double) #1
 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -20,7 +20,9 @@
 }
 
 ; ALL: 'fadd_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
 define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fadd <3 x float> %vec, %b
@@ -28,6 +30,17 @@
   ret void
 }
 
+; ALL: 'fadd_v5f32'
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
+define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fadd <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -26,8 +26,10 @@
 }
 
 ; ALL: 'fdiv_v3f32'
-; NOFP32DENORM: estimated cost of 36 for {{.*}} fdiv <3 x float>
-; FP32DENORMS: estimated cost of 30 for {{.*}} fdiv <3 x float>
+; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 36/30 when it is legal.
+; NOFP32DENORM: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
+; FP32DENORMS: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
 define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fdiv <3 x float> %vec, %b
@@ -35,6 +37,18 @@
   ret void
 }
 
+; ALL: 'fdiv_v5f32'
+; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 60/50 when it is legal.
+; NOFP32DENORM: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
+; FP32DENORMS: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
+define amdgpu_kernel void @fdiv_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fdiv <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fdiv_f64'
 ; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -20,7 +20,9 @@
 }
 
 ; ALL: 'fmul_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
 define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fmul <3 x float> %vec, %b
@@ -28,6 +30,17 @@
   ret void
 }
 
+; ALL: 'fmul_v5f32'
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
+define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fmul <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -20,7 +20,9 @@
 }
 
 ; ALL: 'fsub_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float>
 define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fsub <3 x float> %vec, %b
@@ -28,6 +30,17 @@
   ret void
 }
 
+; ALL: 'fsub_v5f32'
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float>
+define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
+  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
+  %add = fsub <5 x float> %vec, %b
+  store <5 x float> %add, <5 x float> addrspace(1)* %out
+  ret void
+}
+
 ; ALL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
 ; SLOWF64: estimated cost of 3 for {{.*}} fsub double
Index: llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -19,7 +19,9 @@
 }
 
 ; CHECK: 'mul_v3i32'
-; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
+; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
+; and 9 when it is legal.
+; CHECK: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
 define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
@@ -27,6 +29,17 @@
   ret void
 }
 
+; CHECK: 'mul_v5i32'
+; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
+; and 15 when it is legal.
+; CHECK: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
+define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
+  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
+  %mul = mul <5 x i32> %vec, %b
+  store <5 x i32> %mul, <5 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: 'mul_v4i32'
 ; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
 define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -26,7 +26,9 @@
 declare hidden void @external_void_func_f64(double) #0
 declare hidden void @external_void_func_v2f32(<2 x float>) #0
 declare hidden void @external_void_func_v2f64(<2 x double>) #0
+declare hidden void @external_void_func_v3f32(<3 x float>) #0
 declare hidden void @external_void_func_v3f64(<3 x double>) #0
+declare hidden void @external_void_func_v5f32(<5 x float>) #0
 
 declare hidden void @external_void_func_v2i16(<2 x i16>) #0
 declare hidden void @external_void_func_v2f16(<2 x half>) #0
@@ -39,6 +41,7 @@
 declare hidden void @external_void_func_v3i32(<3 x i32>) #0
 declare hidden void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
 declare hidden void @external_void_func_v4i32(<4 x i32>) #0
+declare hidden void @external_void_func_v5i32(<5 x i32>) #0
 declare hidden void @external_void_func_v8i32(<8 x i32>) #0
 declare hidden void @external_void_func_v16i32(<16 x i32>) #0
 declare hidden void @external_void_func_v32i32(<32 x i32>) #0
@@ -341,6 +344,30 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-NOT: v3,
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
+  call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v5f32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-DAG: v_mov_b32_e32 v3, -1.0
+; GCN-DAG: v_mov_b32_e32 v4, 0.5
+; GCN-NOT: v5,
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
+  call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
 ; GCN: v_mov_b32_e32 v0, 0{{$}}
 ; GCN: v_mov_b32_e32 v1, 0x40100000
@@ -519,6 +546,19 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v5i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN-DAG: v_mov_b32_e32 v4, 5
+; GCN-NOT: v5,
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
+  call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
@@ -764,9 +804,140 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}stack_12xv3i32:
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 v31, 11
+; GCN: s_getpc
+define void @stack_12xv3i32() #0 {
+entry:
+  call void @external_void_func_12xv3i32(
+      <3 x i32> <i32 0, i32 0, i32 0>,
+      <3 x i32> <i32 1, i32 1, i32 1>,
+      <3 x i32> <i32 2, i32 2, i32 2>,
+      <3 x i32> <i32 3, i32 3, i32 3>,
+      <3 x i32> <i32 4, i32 4, i32 4>,
+      <3 x i32> <i32 5, i32 5, i32 5>,
+      <3 x i32> <i32 6, i32 6, i32 6>,
+      <3 x i32> <i32 7, i32 7, i32 7>,
+      <3 x i32> <i32 8, i32 8, i32 8>,
+      <3 x i32> <i32 9, i32 9, i32 9>,
+      <3 x i32> <i32 10, i32 11, i32 12>,
+      <3 x i32> <i32 13, i32 14, i32 15>)
+  ret void
+}
+
+; GCN-LABEL: {{^}}stack_12xv3f32:
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 v31, 0x41300000
+; GCN: s_getpc
+define void @stack_12xv3f32() #0 {
+entry:
+  call void @external_void_func_12xv3f32(
+      <3 x float> <float 0.0, float 0.0, float 0.0>,
+      <3 x float> <float 1.0, float 1.0, float 1.0>,
+      <3 x float> <float 2.0, float 2.0, float 2.0>,
+      <3 x float> <float 3.0, float 3.0, float 3.0>,
+      <3 x float> <float 4.0, float 4.0, float 4.0>,
+      <3 x float> <float 5.0, float 5.0, float 5.0>,
+      <3 x float> <float 6.0, float 6.0, float 6.0>,
+      <3 x float> <float 7.0, float 7.0, float 7.0>,
+      <3 x float> <float 8.0, float 8.0, float 8.0>,
+      <3 x float> <float 9.0, float 9.0, float 9.0>,
+      <3 x float> <float 10.0, float 11.0, float 12.0>,
+      <3 x float> <float 13.0, float 14.0, float 15.0>)
+  ret void
+}
+
+; GCN-LABEL: {{^}}stack_8xv5i32:
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
+; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
+; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
+; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 v31, 7
+; GCN: s_getpc
+define void @stack_8xv5i32() #0 {
+entry:
+  call void @external_void_func_8xv5i32(
+      <5 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0>,
+      <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1>,
+      <5 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2>,
+      <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3>,
+      <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4>,
+      <5 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5>,
+      <5 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10>,
+      <5 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15>)
+  ret void
+}
+
+; GCN-LABEL: {{^}}stack_8xv5f32:
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
+; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
+; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
+; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 v31, 0x40e00000
+; GCN: s_getpc
+define void @stack_8xv5f32() #0 {
+entry:
+  call void @external_void_func_8xv5f32(
+      <5 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
+      <5 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>,
+      <5 x float> <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>,
+      <5 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>,
+      <5 x float> <float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>,
+      <5 x float> <float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>,
+      <5 x float> <float 6.0, float 7.0, float 8.0, float 9.0, float 10.0>,
+      <5 x float> <float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>)
+  ret void
+}
+
 declare hidden void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0
 declare hidden void @stack_passed_f64_arg(<32 x i32>, double) #0
-
+declare hidden void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,
+    <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0
+declare hidden void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>,
+    <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0
+declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x float>, <3 x float>,
+    <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0
+declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
+    <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind noinline }
Index: llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
 
 declare void @external_void_func_void() #0
 
@@ -26,6 +26,8 @@
 declare <2 x half> @external_v2f16_func_void() #0
 declare <4 x half> @external_v4f16_func_void() #0
 
+declare <3 x float> @external_v3f32_func_void() #0
+declare <5 x float> @external_v5f32_func_void() #0
 declare <2 x double> @external_v2f64_func_void() #0
 
 declare <2 x i32> @external_v2i32_func_void() #0
@@ -171,6 +173,11 @@
 }
 
 ; GCN-LABEL: {{^}}test_call_external_v3i32_func_void:
+; GCN: s_swappc
+; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
+; GFX7-DAG: flat_store_dword {{.*}}, v2
+; GFX89-DAG: buffer_store_dwordx2 v[0:1]
+; GFX89-DAG: buffer_store_dword v2
 define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
   %val = call <3 x i32> @external_v3i32_func_void()
   store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8
@@ -185,6 +192,11 @@
 }
 
 ; GCN-LABEL: {{^}}test_call_external_v5i32_func_void:
+; GCN: s_swappc
+; GFX7-DAG: flat_store_dwordx4 {{.*}}, v[0:3]
+; GFX7-DAG: flat_store_dword {{.*}}, v4
+; GFX89-DAG: buffer_store_dwordx4 v[0:3]
+; GFX89-DAG: buffer_store_dword v4
 define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 {
   %val = call <5 x i32> @external_v5i32_func_void()
   store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8
@@ -240,6 +252,30 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_v3f32_func_void:
+; GCN: s_swappc
+; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
+; GFX7-DAG: flat_store_dword {{.*}}, v2
+; GFX89-DAG: buffer_store_dwordx2 v[0:1]
+; GFX89-DAG: buffer_store_dword v2
+define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 {
+  %val = call <3 x float> @external_v3f32_func_void()
+  store volatile <3 x float> %val, <3 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v5f32_func_void:
+; GCN: s_swappc
+; GFX7-DAG: flat_store_dwordx4 {{.*}}, v[0:3]
+; GFX7-DAG: flat_store_dword {{.*}}, v4
+; GFX89-DAG: buffer_store_dwordx4 v[0:3]
+; GFX89-DAG: buffer_store_dword v4
+define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 {
+  %val = call <5 x float> @external_v5f32_func_void()
+  store volatile <5 x float> %val, <5 x float> addrspace(1)* undef
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_i32_i64_func_void:
 define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
   %val = call { i32, i64 } @external_i32_i64_func_void()
Index: llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -200,4 +200,94 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}ps_mesa_inreg_v3i32:
+; GCN-DAG: s_add_i32 s0, s0, 1
+; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
+; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
+define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
+  %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
+  store <3 x i32> %add, <3 x i32> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}ps_mesa_inreg_v3f32:
+; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
+; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
+; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
+define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
+  %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
+  store <3 x float> %add, <3 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}ps_mesa_inreg_v5i32:
+; GCN-DAG: s_add_i32 s0, s0, 1
+; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
+; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
+; GCN-DAG: s_add_i32 s{{[0-9]*}}, s3, 4
+; GCN-DAG: s_add_i32 s{{[0-9]*}}, s4, 5
+define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
+  %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
+  store <5 x i32> %add, <5 x i32> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}ps_mesa_inreg_v5f32:
+; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
+; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
+; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
v_add_f32{{.*}}, s2, 4.0 +; GCN-DAG: v_add_f32{{.*}}, s3, -1.0 +; GCN-DAG: v_add_f32{{.*}}, s4, 0.5 +define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { + %add = fadd <5 x float> %arg0, + store <5 x float> %add, <5 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}ps_mesa_v3i32: +; GCN-DAG: v_add_{{.*}}, 1, v0 +; GCN-DAG: v_add_{{.*}}, 2, v1 +; GCN-DAG: v_add_{{.*}}, 3, v2 +define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) { + %add = add <3 x i32> %arg0, + store <3 x i32> %add, <3 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}ps_mesa_v3f32: +; GCN-DAG: v_add_{{.*}}, 1.0, v0 +; GCN-DAG: v_add_{{.*}}, 2.0, v1 +; GCN-DAG: v_add_{{.*}}, 4.0, v2 +define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) { + %add = fadd <3 x float> %arg0, + store <3 x float> %add, <3 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}ps_mesa_v5i32: +; GCN-DAG: v_add_{{.*}}, 1, v0 +; GCN-DAG: v_add_{{.*}}, 2, v1 +; GCN-DAG: v_add_{{.*}}, 3, v2 +; GCN-DAG: v_add_{{.*}}, 4, v3 +; GCN-DAG: v_add_{{.*}}, 5, v4 +define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { + %add = add <5 x i32> %arg0, + store <5 x i32> %add, <5 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}ps_mesa_v5f32: +; GCN-DAG: v_add_f32{{.*}}, 1.0, v0 +; GCN-DAG: v_add_f32{{.*}}, 2.0, v1 +; GCN-DAG: v_add_f32{{.*}}, 4.0, v2 +; GCN-DAG: v_add_f32{{.*}}, -1.0, v3 +; GCN-DAG: v_add_f32{{.*}}, 0.5, v4 +define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { + %add = fadd <5 x float> %arg0, + store <5 x float> %add, <5 x float> addrspace(1)* undef + ret void +} + + + attributes #0 = { nounwind noinline } Index: llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll +++ llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll @@ -375,6 +375,122 @@ ret void } +; FUNC-LABEL: {{^}}v5i8_arg: +; HSA-GFX9: kernarg_segment_byte_size = 16 +; HSA-GFX9: kernarg_segment_alignment = 4 + +; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 +; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 +; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 + +; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb + +; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 +define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind { +entry: + store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v5i16_arg: +; HSA-GFX9: kernarg_segment_byte_size = 32 +; HSA-GFX9: kernarg_segment_alignment = 4 + +; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58 +; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58 +; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58 + +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd + +; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 +; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind { +entry: + store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v5i32_arg: +; HSA-GFX9: kernarg_segment_byte_size = 64 +; HSA-GFX9: kernarg_segment_alignment = 5 +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; SI: s_load_dwordx4 
+; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
+define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
+entry:
+  store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5f32_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 64
+; HSA-GFX9: kernarg_segment_alignment = 5
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
+define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
+entry:
+  store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i64_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 128
+; HSA-GFX9: kernarg_segment_alignment = 6
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
+; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
+; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
+entry:
+  store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5f64_arg:
+; HSA-GFX9: kernarg_segment_byte_size = 128
+; HSA-GFX9: kernarg_segment_alignment = 6
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
+; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
+; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
+entry:
+  store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
+  ret void
+}
+
 ; FIXME: Lots of unpack and re-pack junk on VI
 ; FUNC-LABEL: {{^}}v8i8_arg:
 ; HSA-GFX9: kernarg_segment_byte_size = 16
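For reference, the rounding rule introduced by the AMDGPUISelLowering.cpp hunk at the top of this patch can be sketched in isolation. This is a minimal sketch, not part of the patch: it reuses the EVT calls from the hunk, and roundUpKernArgVT is a hypothetical helper name used only for illustration.

  // Hypothetical helper mirroring the lowering change above:
  // non-power-of-two vector kernel arguments (vec3, vec5) are widened
  // to the next power-of-two vector type (vec4, vec8) before argument
  // offsets are computed.
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"

  static llvm::EVT roundUpKernArgVT(llvm::EVT MemVT, llvm::LLVMContext &Ctx) {
    // A single-element vector is treated as its scalar type, as in the caller.
    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
      MemVT = MemVT.getScalarType();

    // Round up vec3/vec5: v3i32 -> v4i32, v5f32 -> v8f32.
    if (MemVT.isVector() && !MemVT.isPow2VectorType())
      MemVT = MemVT.getPow2VectorType(Ctx);

    return MemVT;
  }

This rounding is why v5i32_arg in kernel-args.ll expects kernarg_segment_byte_size = 64: the <5 x i32> argument is laid out with the 32-byte size and alignment of v8i32, following the 8-byte output pointer.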