diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -33,9 +33,9 @@
   // Big endian vectors must be passed as if they were 1-element vectors so that
   // their lanes are in a consistent order.
-  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8, v1i64, v1f64],
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8, v1i64, v1f64],
                          CCBitConvertToType<f64>>>,
-  CCIfBigEndian<CCIfType<[v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64],
+  CCIfBigEndian<CCIfType<[v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64],
                          CCBitConvertToType<f128>>>,
 
   // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter.
@@ -75,10 +75,10 @@
   CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
 
   CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
-            nxv2f32, nxv4f32, nxv2f64],
+            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
   CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
-            nxv2f32, nxv4f32, nxv2f64],
+            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCPassIndirect<i64>>,
 
   CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
            CCAssignToReg<[P0, P1, P2, P3]>>,
@@ -102,22 +102,24 @@
                                              [W0, W1, W2, W3, W4, W5, W6, W7]>>,
   CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
-  CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
+  CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>,
   CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
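
NOTE (illustrative, not from the patch): with the AAPCS rules above, a scalar
bfloat argument or return value uses the same H registers as half. A minimal
IR sketch, assuming a target with the +bf16 feature:

    define bfloat @second(bfloat %a, bfloat %b) {
      ret bfloat %b
    }

Here %a should be assigned to h0 and %b to h1, with the result expected back
in h0; arguments beyond h7 fall through to the 8-byte stack rule added above.
The function name is arbitrary.
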
@@ -132,9 +134,9 @@
 
   // Big endian vectors must be passed as if they were 1-element vectors so that
   // their lanes are in a consistent order.
-  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8, v1i64, v1f64],
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8, v1i64, v1f64],
                          CCBitConvertToType<f64>>>,
-  CCIfBigEndian<CCIfType<[v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64],
+  CCIfBigEndian<CCIfType<[v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64],
                          CCBitConvertToType<f128>>>,
 
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -144,18 +146,20 @@
                                              [W0, W1, W2, W3, W4, W5, W6, W7]>>,
   CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
-            nxv2f32, nxv4f32, nxv2f64],
+            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
 
   CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -165,7 +169,7 @@
 // Vararg functions on windows pass floats in integer registers
 let Entry = 1 in
 def CC_AArch64_Win64_VarArg : CallingConv<[
-  CCIfType<[f16, f32], CCPromoteToType<f64>>,
+  CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
   CCIfType<[f64], CCBitConvertToType<i64>>,
   CCDelegateTo<CC_AArch64_AAPCS>
 ]>;
@@ -219,19 +223,22 @@
                                              [W0, W1, W2, W3, W4, W5, W6, W7]>>,
   CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
   CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
-  CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
+  CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16",
+       CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
 
   // Re-demote pointers to 32-bits so we don't end up storing 64-bit
@@ -239,9 +246,9 @@
   // values in i32 stack slots.
   CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
   CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
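
NOTE (illustrative, not from the patch): the vararg conventions deliberately
widen bf16. On Win64, for instance, a variadic bfloat is promoted to f64 and
then bit-converted to i64, so it travels in an integer register. A sketch with
arbitrary names:

    declare void @sink(i32, ...)

    define void @caller(bfloat %b) {
      call void (i32, ...) @sink(i32 0, bfloat %b)
      ret void
    }

Under CC_AArch64_Win64_VarArg, %b is expected to reach @sink in x1 rather than
in an H or Q register.
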
@@ -255,14 +262,14 @@
 
   // Handle all scalar types as either i64 or f64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
-  CCIfType<[f16, f32], CCPromoteToType<f64>>,
+  CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
 
   // Everything is on the stack.
   // i128 is split to two i64s, and its stack alignment is 16 bytes.
   CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
-  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
 
@@ -275,16 +282,16 @@
 
   // Handle all scalar types as either i32 or f32.
   CCIfType<[i8, i16], CCPromoteToType<i32>>,
-  CCIfType<[f16], CCPromoteToType<f32>>,
+  CCIfType<[f16, bf16], CCPromoteToType<f32>>,
 
   // Everything is on the stack.
   // i128 is split to two i64s, and its stack alignment is 16 bytes.
   CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
   CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
-  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -132,6 +132,7 @@
   if (Subtarget->hasFPARMv8()) {
     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
@@ -148,6 +149,7 @@
     addDRTypeForNEON(MVT::v1i64);
     addDRTypeForNEON(MVT::v1f64);
     addDRTypeForNEON(MVT::v4f16);
+    addDRTypeForNEON(MVT::v4bf16);
 
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
@@ -156,6 +158,7 @@
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
     addQRTypeForNEON(MVT::v8f16);
+    addQRTypeForNEON(MVT::v8bf16);
   }
 
   if (Subtarget->hasSVE()) {
@@ -174,6 +177,9 @@
     addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
+    addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
+    addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
+    addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
@@ -3578,6 +3584,8 @@
         RC = &AArch64::GPR64RegClass;
       else if (RegVT == MVT::f16)
         RC = &AArch64::FPR16RegClass;
+      else if (RegVT == MVT::bf16)
+        RC = &AArch64::FPR16RegClass;
      else if (RegVT == MVT::f32)
        RC = &AArch64::FPR32RegClass;
      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
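
NOTE (illustrative, not from the patch): together with the calling-convention
rules above, these addRegisterClass calls make bf16 and its scalable vector
forms legal, so SVE bfloat values can be assigned to Z registers. A sketch,
assuming a target with +sve and +bf16:

    define <vscale x 8 x bfloat> @id(<vscale x 8 x bfloat> %v) {
      ret <vscale x 8 x bfloat> %v
    }

%v is expected to arrive in z0 and be returned in z0 unchanged.
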
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -4447,14 +4447,14 @@
                           SDPatternOperator OpN> {
   // Unscaled half-precision to 32-bit
   def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
-                                     [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+                                     [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> {
     let Inst{31} = 0; // 32-bit GPR flag
     let Predicates = [HasFullFP16];
   }
 
   // Unscaled half-precision to 64-bit
   def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
-                                     [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+                                     [(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> {
     let Inst{31} = 1; // 64-bit GPR flag
     let Predicates = [HasFullFP16];
   }
@@ -4489,7 +4489,7 @@
   // Scaled half-precision to 32-bit
   def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
                               fixedpoint_f16_i32, asm,
-                              [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+                              [(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn),
                                                           fixedpoint_f16_i32:$scale)))]> {
     let Inst{31} = 0; // 32-bit GPR flag
     let scale{5} = 1;
@@ -4499,7 +4499,7 @@
   // Scaled half-precision to 64-bit
   def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
                               fixedpoint_f16_i64, asm,
-                              [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+                              [(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn),
                                                           fixedpoint_f16_i64:$scale)))]> {
     let Inst{31} = 1; // 64-bit GPR flag
     let Predicates = [HasFullFP16];
@@ -4615,7 +4615,7 @@
   // Scaled
   def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
-                             [(set FPR16:$Rd,
+                             [(set (f16 FPR16:$Rd),
                                    (fdiv (node GPR32:$Rn),
                                          fixedpoint_f16_i32:$scale))]> {
     let Inst{31} = 0; // 32-bit GPR flag
     let scale{5} = 1;
@@ -4643,7 +4643,7 @@
   }
 
   def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
-                             [(set FPR16:$Rd,
+                             [(set (f16 FPR16:$Rd),
                                    (fdiv (node GPR64:$Rn),
                                          fixedpoint_f16_i64:$scale))]> {
     let Inst{31} = 1; // 64-bit GPR flag
@@ -4816,7 +4816,7 @@
 multiclass FPConversion<string asm> {
   // Double-precision to Half-precision
   def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
-                             [(set FPR16:$Rd, (any_fpround FPR64:$Rn))]>;
+                             [(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>;
 
   // Double-precision to Single-precision
   def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
@@ -4824,11 +4824,11 @@
 
   // Half-precision to Double-precision
   def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
-                             [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>;
+                             [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
 
   // Half-precision to Single-precision
   def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
-                             [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>;
+                             [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
 
   // Single-precision to Double-precision
   def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
@@ -4836,7 +4836,7 @@
 
   // Single-precision to Half-precision
   def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
-                             [(set FPR16:$Rd, (any_fpround FPR32:$Rn))]>;
+                             [(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>;
 }
 
 //---
@@ -4938,7 +4938,7 @@
 multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
   def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
-                  [(set FPR16:$Rd, (fneg (node FPR16:$Rn, FPR16:$Rm)))]> {
+                  [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> {
     let Inst{23-22} = 0b11; // 16-bit size flag
     let Predicates = [HasFullFP16];
   }
@@ -4980,7 +4980,7 @@
 multiclass ThreeOperandFPData<bit isNegated, bit isSub, string asm,
                               SDPatternOperator node> {
   def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
-            [(set FPR16:$Rd,
-                  (node FPR16:$Rn, FPR16:$Rm, FPR16:$Ra))]> {
+            [(set (f16 FPR16:$Rd),
+                  (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
     let Inst{23-22} = 0b11; // 16-bit size flag
     let Predicates = [HasFullFP16];
@@ -5042,7 +5042,7 @@
                            SDPatternOperator OpNode = null_frag> {
   let Defs = [NZCV] in {
   def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
-                  [(OpNode FPR16:$Rn, FPR16:$Rm), (implicit NZCV)]> {
+                  [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> {
     let Inst{23-22} = 0b11;
     let Predicates = [HasFullFP16];
   }
@@ -6742,7 +6742,7 @@
                 [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
   let Predicates = [HasNEON, HasFullFP16] in {
   def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
-                [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+                [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>;
   } // Predicates = [HasNEON, HasFullFP16]
 }
@@ -6949,7 +6949,7 @@
                 [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
   let Predicates = [HasNEON, HasFullFP16] in {
   def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
-                [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+                [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
   }
 }
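
NOTE (illustrative, not from the patch): the (f16 ...) casts added throughout
this file are needed because FPR16 now carries two value types (see the
AArch64RegisterInfo.td hunk below), so a bare FPR16 operand no longer lets
TableGen infer a unique type for a selection pattern. Schematically:

    // Ambiguous once FPR16 = [f16, bf16]: $Rn could be either type.
    //   [(set GPR32:$Rd, (OpN FPR16:$Rn))]
    // Unambiguous: the cast pins $Rn to the f16 interpretation.
    //   [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]
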
@@ -7091,10 +7091,10 @@
 
   let Predicates = [HasNEON, HasFullFP16] in {
   def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
                                    asm, ".4h",
-        [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+        [(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>;
   def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
                                    asm, ".8h",
-        [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+        [(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>;
   } // Predicates = [HasNEON, HasFullFP16]
   def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
                                    asm, ".4s",
@@ -8095,7 +8095,7 @@
   def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
                            (AArch64dup (f16 FPR16Op_lo:$Rm)))),
             (!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR16Op_lo:$Rm, hsub), (i64 0))>;
+                (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
 
   def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
                            (AArch64duplane16 (v8f16 V128_lo:$Rm),
@@ -8105,7 +8105,7 @@
   def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
                            (AArch64dup (f16 FPR16Op_lo:$Rm)))),
             (!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR16Op_lo:$Rm, hsub), (i64 0))>;
+                (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
 
   def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn),
                  (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2529,7 +2529,7 @@
                     [(set FPR8Op:$Rt,
                           (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
 defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
-                    [(set FPR16Op:$Rt,
+                    [(set (f16 FPR16Op:$Rt),
                           (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
 defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
                     [(set (f32 FPR32Op:$Rt),
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -422,18 +422,20 @@
 def FPR8  : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
   let Size = 8;
 }
-def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
   let Size = 16;
 }
+
 def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> {
   let Size = 16;
 }
 def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
 def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
-                                     v1i64, v4f16],
-                                     64, (sequence "D%u", 0, 31)>;
+                                     v1i64, v4f16, v4bf16],
+                          64, (sequence "D%u", 0, 31)>;
 def FPR64_lo : RegisterClass<"AArch64",
-                             [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
+                             [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32,
+                              v1f64],
                              64, (trunc FPR64, 16)>;
 
 // We don't (yet) have an f128 legal type, so don't use that here. We
@@ -441,13 +443,14 @@
 // that here.
 def FPR128 : RegisterClass<"AArch64",
                            [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
-                            v8f16],
+                            v8f16, v8bf16],
                            128, (sequence "Q%u", 0, 31)>;
 
 // The lower 16 vector registers.  Some instructions can only take registers
 // in this range.
 def FPR128_lo : RegisterClass<"AArch64",
-                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
+                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16,
+                               v8bf16],
                               128, (trunc FPR128, 16)>;
 
 // Pairs, triples, and quads of 64-bit vector registers.
@@ -876,6 +879,7 @@
 class ZPRClass<int lastreg> : RegisterClass<"AArch64",
                                             [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
                                              nxv2f16, nxv4f16, nxv8f16,
+                                             nxv2bf16, nxv4bf16, nxv8bf16,
                                              nxv2f32, nxv4f32, nxv2f64],
                                             128, (sequence "Z%u", 0, lastreg)> {
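
NOTE (illustrative, not from the patch): with v4bf16/v8bf16 added to FPR64 and
FPR128, NEON bfloat vectors live directly in the D and Q registers. A sketch:

    define <8 x bfloat> @pick(<8 x bfloat> %a, <8 x bfloat> %b) {
      ret <8 x bfloat> %b
    }

%a and %b are expected in q0 and q1, so this should lower to a single register
move (something like "mov v0.16b, v1.16b") followed by ret.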