Index: clang/include/clang/Basic/BuiltinsAMDGPU.def =================================================================== --- clang/include/clang/Basic/BuiltinsAMDGPU.def +++ clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -309,6 +309,12 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", "nc", "mai-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", "nc", "mai-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_f16, "V4fV4hV8hV4fiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_f16, "V16fV4hV8hV16fiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_bf16, "V4fV4sV8sV4fiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts") #undef BUILTIN #undef TARGET_BUILTIN Index: clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl =================================================================== --- clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -10,13 +10,16 @@ typedef float v16f __attribute__((ext_vector_type(16))); typedef float v32f __attribute__((ext_vector_type(32))); typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef half v32h __attribute__((ext_vector_type(32))); +typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); typedef int v16i __attribute__((ext_vector_type(16))); typedef int v32i __attribute__((ext_vector_type(32))); typedef short v2s 
__attribute__((ext_vector_type(2))); typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); typedef short v16s __attribute__((ext_vector_type(16))); typedef short v32s __attribute__((ext_vector_type(32))); typedef double v4d __attribute__((ext_vector_type(4))); @@ -247,4 +250,46 @@ { *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0); } + +// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16 +// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx) +{ + *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16 +// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx) +{ + *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16 +// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx) +{ + *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16 +// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx) +{ + *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8 +// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, 
<4 x i32> %c, i32 %idx, i32 0, i32 0) +void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx) +{ + *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8 +// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0) +void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx) +{ + *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0); +} #endif // MFMA_GFX940_TESTS Index: clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl =================================================================== --- clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl +++ clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl @@ -3,8 +3,13 @@ typedef float v2f __attribute__((ext_vector_type(2))); typedef float v4f __attribute__((ext_vector_type(4))); typedef float v16f __attribute__((ext_vector_type(16))); +typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); typedef int v16i __attribute__((ext_vector_type(16))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); void test_mfma_i32_16x16x32i8(global v4i* out, long a, long b, v4i c, int d) { @@ -33,3 +38,39 @@ *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}} *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}} } + +void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx, int d) +{ + *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, d, 
0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x32_f16' must be a constant integer}} +} + +void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx, int d) +{ + *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x16_f16' must be a constant integer}} + *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x16_f16' must be a constant integer}} +} + +void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx, int d) +{ + *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x32_bf16' must be a constant integer}} +} + +void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx, int d) +{ + *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x16_bf16' must be a constant integer}} + *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x16_bf16' must be a constant integer}} +} + +void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx, int d) +{ + *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_i32_16x16x64_i8' must be a constant integer}} + *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, d); // expected-error{{argument to 
'__builtin_amdgcn_smfmac_i32_16x16x64_i8' must be a constant integer}} +} + +void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx, int d) +{ + *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_i32_32x32x32_i8' must be a constant integer}} + *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_i32_32x32x32_i8' must be a constant integer}} +} Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2004,6 +2004,22 @@ def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic; +// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid +class AMDGPUMSmfmacIntrinsic : + GCCBuiltin, + Intrinsic<[DestTy], + [SrcA, SrcB, DestTy, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, IntrWillReturn, + ImmArg>, ImmArg>]>; + +def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. 
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -144,6 +144,7 @@ bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, MachineOperand &DataOp) const; bool selectBVHIntrinsic(MachineInstr &I) const; + bool selectSMFMACIntrin(MachineInstr &I) const; bool selectWaveAddress(MachineInstr &I) const; std::pair selectVOP3ModsImpl(MachineOperand &Root, Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -971,6 +971,13 @@ return selectGroupStaticSize(I); case Intrinsic::returnaddress: return selectReturnAddress(I); + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + return selectSMFMACIntrin(I); default: return selectImpl(I, *CoverageInfo); } @@ -3054,6 +3061,41 @@ return true; } +bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { + unsigned Opc; + switch (MI.getIntrinsicID()) { + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; + break; + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; + break; + case 
Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; + break; + default: + llvm_unreachable("unhandled smfmac intrinsic"); + } + + auto VDst_In = MI.getOperand(4); + + MI.setDesc(TII.get(Opc)); + MI.removeOperand(4); // VDst_In + MI.removeOperand(1); // Intrinsic ID + MI.addOperand(VDst_In); // Readd VDst_In to the end + MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + return true; +} + bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4267,6 +4267,20 @@ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: { + // vdst, srcA, srcB, srcC, idx + OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: Index: llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -299,6 
+299,12 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and Index: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h =================================================================== --- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -127,6 +127,8 @@ MCOperand decodeOperand_AReg_1024(unsigned Val) const; MCOperand decodeOperand_AV_32(unsigned Val) const; MCOperand decodeOperand_AV_64(unsigned Val) const; + MCOperand decodeOperand_AV_128(unsigned Val) const; + MCOperand decodeOperand_AV_512(unsigned Val) const; enum OpWidthTy { OPW32, Index: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -147,6 +147,8 @@ DECODE_OPERAND_REG(AReg_1024) DECODE_OPERAND_REG(AV_32) DECODE_OPERAND_REG(AV_64) +DECODE_OPERAND_REG(AV_128) +DECODE_OPERAND_REG(AV_512) static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm, @@ -996,6 +998,14 @@ return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_AV_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AV_512(unsigned Val) const { + return decodeSrcOp(OPW512, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const { return createRegOperand(AMDGPU::VReg_64RegClassID, Val); } Index: llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- 
llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -477,6 +477,7 @@ MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) Enc |= 512; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11661,6 +11661,19 @@ // so no use checks are needed. MRI.setRegClass(Op.getReg(), NewRC); } + + // Resolve the rest of AV operands to AGPRs. + if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { + if (Src2->isReg() && Src2->getReg().isVirtual()) { + auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); + if (TRI->isVectorSuperClass(RC)) { + auto *NewRC = TRI->getEquivalentAGPRClass(RC); + MRI.setRegClass(Src2->getReg(), NewRC); + if (Src2->isTied()) + MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); + } + } + } } return; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4773,6 +4773,9 @@ case AMDGPU::AV_160RegClassID: RCID = AMDGPU::VReg_160RegClassID; break; + case AMDGPU::AV_512RegClassID: + RCID = AMDGPU::VReg_512RegClassID; + break; default: break; } Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2412,6 +2412,13 @@ def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>; def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>; +def 
VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>; +def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>; +def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>; +def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>; +def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>; +def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>; + class Commutable_REV { string RevOp = revOp; bit IsOrig = isOrig; Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1103,6 +1103,16 @@ let EncoderMethod = "getAVOperandEncoding"; } +def AVSrc_128 : RegisterOperand { + let DecoderMethod = "DecodeAV_128RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVSrc_512 : RegisterOperand { + let DecoderMethod = "DecodeAV_512RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + def AVLdSt_32 : RegisterOperand { let DecoderMethod = "DecodeAVLdSt_32RegisterClass"; let EncoderMethod = "getAVOperandEncoding"; Index: llvm/lib/Target/AMDGPU/SISchedule.td =================================================================== --- llvm/lib/Target/AMDGPU/SISchedule.td +++ llvm/lib/Target/AMDGPU/SISchedule.td @@ -277,6 +277,9 @@ def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; + } // End SchedModel = SIDPGFX940FullSpeedModel let SchedModel = GFX10SpeedModel in { Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ 
llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -117,8 +117,13 @@ } class getVOP3MAIPat { - list ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, - timm:$cbsz, timm:$abid, timm:$blgp))]; + list ret = !if(!eq(P.Src0VT, P.Src1VT), + // mfma + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, + timm:$cbsz, timm:$abid, timm:$blgp))], + // smfmac + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, + timm:$cbsz, timm:$abid))]); } // Consistently gives instructions a _e64 suffix. Index: llvm/lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -395,6 +395,16 @@ bit NoDstOverlap = !gt(DstVT.Size, 128); } +class VOPProfileSMFMAC + : VOPProfileMAI { + let Src1RC64 = _SrcBRC; + let Src2VT = DstVT; + let Asm64 = " $vdst, $src0, $src1, $idx$cbsz$abid"; + let Outs64 = (outs DstRC:$vdst); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, VRegSrc_32:$idx, cbsz:$cbsz, abid:$abid, Src2RC64:$src2); +} + def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI; def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI; def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI; @@ -439,6 +449,13 @@ def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI; def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI; +def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC; +def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC; + class MFMATable { bit IsMac = is_mac; string FMAOp = Name; @@ -542,6 +559,22 @@ defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>; } // End Predicates = [isGFX940Plus] 
+multiclass SMFMACInst { + let Constraints = "$vdst = $src2", DisableEncoding = "$src2", + isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { + defm NAME : VOP3Inst("VOPProfileSMFMAC_" # P), node>; + } +} + +let SubtargetPredicate = isGFX940Plus in { +defm V_SMFMAC_F32_16X16X32_F16 : SMFMACInst<"v_smfmac_f32_16x16x32_f16", "F32_16X16X32_F16", int_amdgcn_smfmac_f32_16x16x32_f16>; +defm V_SMFMAC_F32_32X32X16_F16 : SMFMACInst<"v_smfmac_f32_32x32x16_f16", "F32_32X32X16_F16", int_amdgcn_smfmac_f32_32x32x16_f16>; +defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16", "F32_16X16X32_I16", int_amdgcn_smfmac_f32_16x16x32_bf16>; +defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>; +defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>; +defm V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>; +} + let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in { defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile, any_fma>; defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile, any_fmul>; @@ -642,6 +675,14 @@ } } +multiclass VOP3P_Real_SMFMAC op> { + def _gfx940 : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3Pe_SMFMAC { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX8"; + } +} + defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>; defm V_PK_ADD_I16 : VOP3P_Real_vi <0x02>; @@ -756,6 +797,13 @@ defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx940 <0x6e, "v_mfma_f64_16x16x4_f64">; defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx940 <0x6f, "v_mfma_f64_4x4x4_4b_f64">; +defm V_SMFMAC_F32_16X16X32_F16 : VOP3P_Real_SMFMAC <0x62>; +defm V_SMFMAC_F32_32X32X16_F16 : VOP3P_Real_SMFMAC <0x64>; +defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66>; +defm 
V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68>; +defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a>; +defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c>; + let SubtargetPredicate = HasPackedFP32Ops in { defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>; Index: llvm/lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOPInstructions.td +++ llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -372,6 +372,36 @@ let Inst{63-61} = !if(P.HasSrc1, blgp, 0); } +class VOP3Pe_SMFMAC op> : Enc64 { + bits<10> vdst; + bits<10> src0; + bits<10> src1; + bits<9> idx; + bits<3> blgp; + bits<3> cbsz; + bits<4> abid; + + let vdst{8} = 1; // VGPR or AGPR, but not SGPR + let blgp = 0; + + let Inst{7-0} = vdst{7-0}; + + let Inst{10-8} = cbsz; + let Inst{14-11} = abid; + + let Inst{15} = vdst{9}; // acc(vdst) + + let Inst{22-16} = op; + let Inst{31-23} = 0x1a7; //encoding + let Inst{40-32} = src0{8-0}; + let Inst{49-41} = src1{8-0}; + let Inst{58-50} = idx; + + let Inst{59} = src0{9}; // acc(0) + let Inst{60} = src1{9}; // acc(1) + + let Inst{63-61} = blgp; +} class VOP3Pe_gfx10 op, VOPProfile P> : VOP3Pe { let Inst{31-23} = 0x198; //encoding Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir @@ -117,3 +117,195 @@ %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4.xf32), %0, %1, %2, 0, 0, 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 ... 
+
+---
+name: smfmac_f32_16x16x32_f16_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+
+    ; FAST-LABEL: name: smfmac_f32_16x16x32_f16_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.16x16x32.f16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<4 x s32>), [[COPY3]](s32), 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    ; GREEDY-LABEL: name: smfmac_f32_16x16x32_f16_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.16x16x32.f16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<4 x s32>), [[COPY3]](s32), 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    %3:_(s32) = COPY $vgpr20
+    %4:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.16x16x32.f16), %0, %1, %2, %3, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4
+...
+
+---
+name: smfmac_f32_32x32x16_f16_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+
+    ; FAST-LABEL: name: smfmac_f32_32x32x16_f16_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.32x32x16.f16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<16 x s32>), [[COPY3]](s32), 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    ; GREEDY-LABEL: name: smfmac_f32_32x32x16_f16_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.32x32x16.f16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<16 x s32>), [[COPY3]](s32), 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    %3:_(s32) = COPY $vgpr20
+    %4:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.32x32x16.f16), %0, %1, %2, %3, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %4
+...
+
+---
+name: smfmac_f32_16x16x32_bf16_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+
+    ; FAST-LABEL: name: smfmac_f32_16x16x32_bf16_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.16x16x32.bf16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<4 x s32>), [[COPY3]](s32), 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    ; GREEDY-LABEL: name: smfmac_f32_16x16x32_bf16_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.16x16x32.bf16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<4 x s32>), [[COPY3]](s32), 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    %3:_(s32) = COPY $vgpr20
+    %4:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.16x16x32.bf16), %0, %1, %2, %3, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4
+...
+
+---
+name: smfmac_f32_32x32x16_bf16_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+
+    ; FAST-LABEL: name: smfmac_f32_32x32x16_bf16_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.32x32x16.bf16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<16 x s32>), [[COPY3]](s32), 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    ; GREEDY-LABEL: name: smfmac_f32_32x32x16_bf16_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.32x32x16.bf16), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<16 x s32>), [[COPY3]](s32), 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    %3:_(s32) = COPY $vgpr20
+    %4:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.f32.32x32x16.bf16), %0, %1, %2, %3, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %4
+...
+
+---
+name: smfmac_i32_16x16x64_i8_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+
+    ; FAST-LABEL: name: smfmac_i32_16x16x64_i8_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.i32.16x16x64.i8), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<4 x s32>), [[COPY3]](s32), 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    ; GREEDY-LABEL: name: smfmac_i32_16x16x64_i8_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, $vgpr20
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.i32.16x16x64.i8), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<4 x s32>), [[COPY3]](s32), 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    %3:_(s32) = COPY $vgpr20
+    %4:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.i32.16x16x64.i8), %0, %1, %2, %3, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4
+...
+
+---
+name: smfmac_i32_32x32x32_i8_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+
+    ; FAST-LABEL: name: smfmac_i32_32x32x32_i8_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20
+    ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.i32.32x32x32.i8), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<16 x s32>), [[COPY3]](s32), 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    ; GREEDY-LABEL: name: smfmac_i32_32x32x32_i8_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5,
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr20 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr20 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.i32.32x32x32.i8), [[COPY]](s64), [[COPY1]](s128), [[COPY2]](<16 x s32>), [[COPY3]](s32), 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(s32) = COPY $vgpr20 + %4:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.smfmac.i32.32x32x32.i8), %0, %1, %2, %3, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %4 +... 
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll @@ -1,12 +1,18 @@ -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,VGPRCD %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,AGPRCD %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half>, <8 x half>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 
x half>, <8 x half>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16>, <8 x i16>, <16 x float>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32>, <4 x i32>, <16 x i32>, i32, i32, i32) ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8: ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 @@ -80,4 +86,109 @@ ret void } +; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16: +; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}} +; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}} +; GCN: v_smfmac_f32_16x16x32_f16 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2 +; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]] +define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(<4 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16: +; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}} +; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], 
s[{{[0-9]+}}:[[SHI]]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}} +; GCN: v_smfmac_f32_32x32x16_f16 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}} +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48 +define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(<16 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16: +; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}} +; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}} +; GCN: v_smfmac_f32_16x16x32_bf16 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2 +; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]] +define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(<4 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call 
<4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16: +; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}} +; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}} +; GCN: v_smfmac_f32_32x32x16_bf16 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}} +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48 +define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(<16 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8: +; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}} +; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}} 
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}} +; GCN: v_smfmac_i32_16x16x64_i8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2 +; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]] +define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(<4 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8: +; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}} +; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}} +; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}} +; GCN: v_smfmac_i32_32x32x32_i8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}} +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32 +; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48 +define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(<16 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2) + store <16 x i32> 
%mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll @@ -5,6 +5,12 @@ declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half>, <8 x half>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half>, <8 x half>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16>, <8 x i16>, <16 x float>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32>, <4 x i32>, <16 x i32>, i32, i32, i32) ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8: ; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] @@ -45,3 +51,63 @@ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg ret void } + +; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16: +; GCN: v_smfmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} +define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(<4 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> 
@llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16: +; GCN: v_smfmac_f32_32x32x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} +define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(<16 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16: +; GCN: v_smfmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} +define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(<4 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16: +; GCN: v_smfmac_f32_32x32x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} +define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(<16 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8: +; GCN: v_smfmac_i32_16x16x64_i8 v[{{[0-9]+:[0-9]+}}], 
v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} +define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(<4 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 0, i32 0) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8: +; GCN: v_smfmac_i32_32x32x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} +define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(<16 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 0, i32 0) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} Index: llvm/test/MC/AMDGPU/mai-gfx940.s =================================================================== --- llvm/test/MC/AMDGPU/mai-gfx940.s +++ llvm/test/MC/AMDGPU/mai-gfx940.s @@ -460,3 +460,51 @@ v_mfma_f32_32x32x4xf32 a[0:15], v[2:3], v[4:5], a[18:33] // GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x4a,0x04] // GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1 +// GFX940: v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe2,0xd3,0x02,0x09,0x02,0x0c] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x32_f16 a[10:13], v[2:3], a[4:7], v1 +// GFX940: v_smfmac_f32_16x16x32_f16 a[10:13], v[2:3], a[4:7], v1 ; encoding: [0x0a,0x80,0xe2,0xd3,0x02,0x09,0x06,0x14] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x16_f16 v[10:25], a[2:3], v[4:7], v2 cbsz:3 abid:1 +// 
GFX940: v_smfmac_f32_32x32x16_f16 v[10:25], a[2:3], v[4:7], v2 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe4,0xd3,0x02,0x09,0x0a,0x0c] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x16_f16 a[10:25], v[2:3], a[4:7], v3 +// GFX940: v_smfmac_f32_32x32x16_f16 a[10:25], v[2:3], a[4:7], v3 ; encoding: [0x0a,0x80,0xe4,0xd3,0x02,0x09,0x0e,0x14] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x32_bf16 v[10:13], a[2:3], v[4:7], v4 cbsz:3 abid:1 +// GFX940: v_smfmac_f32_16x16x32_bf16 v[10:13], a[2:3], v[4:7], v4 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe6,0xd3,0x02,0x09,0x12,0x0c] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x32_bf16 a[10:13], v[2:3], a[4:7], v5 +// GFX940: v_smfmac_f32_16x16x32_bf16 a[10:13], v[2:3], a[4:7], v5 ; encoding: [0x0a,0x80,0xe6,0xd3,0x02,0x09,0x16,0x14] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x16_bf16 v[10:25], a[2:3], v[4:7], v6 cbsz:3 abid:1 +// GFX940: v_smfmac_f32_32x32x16_bf16 v[10:25], a[2:3], v[4:7], v6 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe8,0xd3,0x02,0x09,0x1a,0x0c] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x16_bf16 a[10:25], v[2:3], a[4:7], v7 +// GFX940: v_smfmac_f32_32x32x16_bf16 a[10:25], v[2:3], a[4:7], v7 ; encoding: [0x0a,0x80,0xe8,0xd3,0x02,0x09,0x1e,0x14] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_i32_16x16x64_i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1 +// GFX940: v_smfmac_i32_16x16x64_i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xea,0xd3,0x02,0x09,0x22,0x0c] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9 +// GFX940: v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9 ; encoding: [0x0a,0x80,0xea,0xd3,0x02,0x09,0x26,0x14] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1 
+// GFX940: v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xec,0xd3,0x02,0x09,0x2a,0x0c] +// GFX90A: error: instruction not supported on this GPU + +v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 +// GFX940: v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 ; encoding: [0x0a,0x80,0xec,0xd3,0x02,0x09,0x2e,0x14] +// GFX90A: error: instruction not supported on this GPU Index: llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt =================================================================== --- llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt +++ llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt @@ -62,3 +62,45 @@ # GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[2:17] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04] 0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04 + +# GFX940: v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe2,0xd3,0x02,0x09,0x02,0x0c] +0x0a,0x0b,0xe2,0xd3,0x02,0x09,0x02,0x0c + +# GFX940: v_smfmac_f32_16x16x32_f16 a[10:13], v[2:3], a[4:7], v1 ; encoding: [0x0a,0x80,0xe2,0xd3,0x02,0x09,0x06,0x14] +0x0a,0x80,0xe2,0xd3,0x02,0x09,0x06,0x14 + +# GFX940: v_smfmac_f32_32x32x16_f16 v[10:25], a[2:3], v[4:7], v2 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe4,0xd3,0x02,0x09,0x0a,0x0c] +0x0a,0x0b,0xe4,0xd3,0x02,0x09,0x0a,0x0c + +# GFX940: v_smfmac_f32_32x32x16_f16 a[10:25], v[2:3], a[4:7], v3 ; encoding: [0x0a,0x80,0xe4,0xd3,0x02,0x09,0x0e,0x14] +0x0a,0x80,0xe4,0xd3,0x02,0x09,0x0e,0x14 + +# GFX940: v_smfmac_f32_16x16x32_bf16 v[10:13], a[2:3], v[4:7], v4 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe6,0xd3,0x02,0x09,0x12,0x0c] +0x0a,0x0b,0xe6,0xd3,0x02,0x09,0x12,0x0c + +# GFX940: v_smfmac_f32_16x16x32_bf16 a[10:13], v[2:3], a[4:7], v5 ; encoding: [0x0a,0x80,0xe6,0xd3,0x02,0x09,0x16,0x14] +0x0a,0x80,0xe6,0xd3,0x02,0x09,0x16,0x14 + +# GFX940: v_smfmac_f32_32x32x16_bf16 v[10:25], a[2:3], v[4:7], v6 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe8,0xd3,0x02,0x09,0x1a,0x0c] 
+0x0a,0x0b,0xe8,0xd3,0x02,0x09,0x1a,0x0c + +# GFX940: v_smfmac_f32_32x32x16_bf16 a[10:25], v[2:3], a[4:7], v7 ; encoding: [0x0a,0x80,0xe8,0xd3,0x02,0x09,0x1e,0x14] +0x0a,0x80,0xe8,0xd3,0x02,0x09,0x1e,0x14 + +# GFX940: v_smfmac_f32_32x32x16_bf16 v[10:25], a[2:3], v[4:7], v8 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe8,0xd3,0x02,0x09,0x22,0x0c] +0x0a,0x0b,0xe8,0xd3,0x02,0x09,0x22,0x0c + +# GFX940: v_smfmac_f32_32x32x16_bf16 a[10:25], v[2:3], a[4:7], v9 ; encoding: [0x0a,0x80,0xe8,0xd3,0x02,0x09,0x26,0x14] +0x0a,0x80,0xe8,0xd3,0x02,0x09,0x26,0x14 + +# GFX940: v_smfmac_i32_16x16x64_i8 v[10:13], a[2:3], v[4:7], v10 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xea,0xd3,0x02,0x09,0x2a,0x0c] +0x0a,0x0b,0xea,0xd3,0x02,0x09,0x2a,0x0c + +# GFX940: v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v11 ; encoding: [0x0a,0x80,0xea,0xd3,0x02,0x09,0x2e,0x14] +0x0a,0x80,0xea,0xd3,0x02,0x09,0x2e,0x14 + +# GFX940: v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v12 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xec,0xd3,0x02,0x09,0x32,0x0c] +0x0a,0x0b,0xec,0xd3,0x02,0x09,0x32,0x0c + +# GFX940: v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v13 ; encoding: [0x0a,0x80,0xec,0xd3,0x02,0x09,0x36,0x14] +0x0a,0x80,0xec,0xd3,0x02,0x09,0x36,0x14