Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -261,6 +261,7 @@ def llvm_v4f32_ty : LLVMType; // 4 x float def llvm_v8f32_ty : LLVMType; // 8 x float def llvm_v16f32_ty : LLVMType; // 16 x float +def llvm_v32f32_ty : LLVMType; // 32 x float def llvm_v1f64_ty : LLVMType; // 1 x double def llvm_v2f64_ty : LLVMType; // 2 x double def llvm_v4f64_ty : LLVMType; // 4 x double Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -1663,8 +1663,8 @@ def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn; // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp -def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32i32_ty], - [llvm_float_ty, llvm_float_ty, llvm_v32i32_ty, +def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty], + [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty], @@ -1683,8 +1683,8 @@ [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; -def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32i32_ty], - [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32i32_ty, +def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty], + [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty], @@ -1723,8 +1723,8 @@ [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; -def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32i32_ty], - [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32i32_ty, +def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty], + [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty], Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -165,6 +165,9 @@ setOperationAction(ISD::LOAD, MVT::v16f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::LOAD, MVT::v32f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32); + setOperationAction(ISD::LOAD, MVT::i64, Promote); AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); @@ -256,6 +259,9 @@ setOperationAction(ISD::STORE, MVT::v16f32, Promote); AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::v32f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); @@ -355,7 +361,10 @@ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -153,6 +153,7 @@ if (Subtarget->hasMAIInsts()) { addRegisterClass(MVT::v32i32, &AMDGPU::AReg_1024RegClass); + addRegisterClass(MVT::v32f32, &AMDGPU::AReg_1024RegClass); } computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -263,8 +264,9 @@ // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. - for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) { + for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, + MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, + MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -2178,14 +2178,13 @@ def VOP_V4F32_F32_F32_V4F32 : VOPProfile <[v4f32, f32, f32, v4f32]>; def VOP_V16F32_F32_F32_V16F32 : VOPProfile <[v16f32, f32, f32, v16f32]>; -// TODO: define v32f32 -def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32i32, f32, f32, v32i32]>; +def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32f32, f32, f32, v32f32]>; def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>; def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>; -def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32i32, v4f16, v4f16, v32i32]>; +def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>; def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>; def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>; -def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32i32, v2i16, v2i16, v32i32]>; +def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>; def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>; def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>; def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -942,6 +942,14 @@ def Insert_Element_v32i32_#Index : Insert_Element < i32, v32i32, Index, !cast(sub#Index) >; + + def Extract_Element_v32f32_#Index : Extract_Element < + f32, v32f32, Index, !cast(sub#Index) + >; + + def Insert_Element_v32f32_#Index : Insert_Element < + f32, v32f32, Index, !cast(sub#Index) + >; } // FIXME: Why do only some of these type combinations for SReg and @@ -1034,6 +1042,10 @@ def : BitConvert ; def : BitConvert ; +// 1024-bit bitcast +def : BitConvert ; +def : BitConvert ; + /********** =================== **********/ /********** Src & Dst modifiers **********/ /********** =================== **********/ Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -757,11 +757,11 @@ let isAllocatable = 0; } -def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add SGPR_1024Regs)> { +def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add SGPR_1024Regs)> { let AllocationPriority = 19; } -def SReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, +def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add SGPR_1024)> { let CopyCost = 16; let AllocationPriority = 19; @@ -812,7 +812,7 @@ let AllocationPriority = 7; } -def VReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add VGPR_1024)> { +def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add VGPR_1024)> { let Size = 1024; let CopyCost = 32; let AllocationPriority = 8; @@ -840,7 +840,7 @@ } // TODO: add v32f32 value type -def AReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add AGPR_1024)> { +def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add AGPR_1024)> { let Size = 1024; let CopyCost = 65; let AllocationPriority = 8; Index: test/CodeGen/AMDGPU/agpr-register-count.ll =================================================================== --- test/CodeGen/AMDGPU/agpr-register-count.ll +++ test/CodeGen/AMDGPU/agpr-register-count.ll @@ -1,15 +1,15 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32) +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) ; GCN-LABEL: {{^}}test_32_agprs: ; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0 ; GCN-NOT: v28 ; GCN: NumVgprs: 32 ; GCN: VGPRBlocks: 7 -define amdgpu_kernel void @test_32_agprs(<32 x i32> addrspace(1)* %arg) { +define amdgpu_kernel void @test_32_agprs(<32 x float> addrspace(1)* %arg) { bb: - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0) - store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ret void } Index: test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32) +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32) -declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x i32>, i32, i32, i32) +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) @@ -15,7 +15,7 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32) -declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x i32>, i32, i32, i32) +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) @@ -100,11 +100,11 @@ ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x i32> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) { bb: - %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3) - store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ret void } @@ -326,14 +326,14 @@ ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x i32> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { bb: - %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x i32> %in.1, i32 1, i32 2, i32 3) - store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ret void } @@ -794,13 +794,13 @@ ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x i32> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) { bb: - %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg %a = bitcast i32 1 to <2 x i16> %b = bitcast i32 2 to <2 x i16> - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x i32> %in.1, i32 1, i32 2, i32 3) - store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ret void } @@ -957,12 +957,12 @@ ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc: ; GCN: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x i32> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) { bb: - %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 0, i32 0, i32 0) - %mai.2 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %mai.1, i32 0, i32 0, i32 0) - store <32 x i32> %mai.2, <32 x i32> addrspace(1)* %arg + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.2, <32 x float> addrspace(1)* %arg ret void } @@ -1112,10 +1112,10 @@ ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x i32> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) { bb: - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0) - store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ret void } @@ -1184,7 +1184,7 @@ ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm: ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 @@ -1256,10 +1256,10 @@ ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x i32> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) { bb: - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0) - store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ret void } @@ -1350,12 +1350,12 @@ ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 ; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x i32> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %tid - %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %gep - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3) - store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %gep + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep ret void } Index: test/CodeGen/AMDGPU/spill-agpr.ll =================================================================== --- test/CodeGen/AMDGPU/spill-agpr.ll +++ test/CodeGen/AMDGPU/spill-agpr.ll @@ -84,23 +84,23 @@ ; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 -define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 { +define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { bb: %v = call i32 asm sideeffect "", "=a"() br label %use use: - %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> , i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> , i32 0, i32 0, i32 0) call void asm sideeffect "", "a"(i32 %v) - %elt1 = extractelement <32 x i32> %mai.1, i32 0 - store i32 %elt1, i32 addrspace(1)* %arg + %elt1 = extractelement <32 x float> %mai.1, i32 0 + store float %elt1, float addrspace(1)* %arg ret void } declare i32 @llvm.amdgcn.workitem.id.x() declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) -declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32) +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) attributes #0 = { nounwind "amdgpu-num-vgpr"="24" } attributes #1 = { nounwind "amdgpu-num-vgpr"="8" } Index: test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll =================================================================== --- test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -233,19 +233,23 @@ ret void } +; FIXME: adding an AReg_1024 register class for v32f32 and v32i32 +; produces unnecessary copies and we still have some amount +; of conventional spilling. + ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb: ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-NOT: SCRATCH_RSRC +; GFX908-FIXME-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_write_b32 a0, v ; GFX900: buffer_store_dword v ; GFX900: buffer_load_dword v -; GFX908-NOT: buffer_ +; GFX908-FIXME-NOT: buffer_ ; GFX908-DAG v_accvgpr_read_b32 ; GCN: NumVgprs: 256 ; GFX900: ScratchSize: 580 -; GFX908: ScratchSize: 0 +; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {