diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1321,6 +1321,8 @@ } } else if (VT == MVT::f16) { Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; + } else if (VT == MVT::bf16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; } else if (VT == MVT::f32) { Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost; } else if (VT == MVT::f64 || VT.is64BitVector()) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -622,6 +622,7 @@ setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; @@ -633,6 +634,7 @@ setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); + setIndexedLoadAction(im, MVT::bf16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); @@ -640,6 +642,7 @@ setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); + setIndexedStoreAction(im, MVT::bf16, Legal); } // Trap. @@ -2818,7 +2821,8 @@ } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() != MVT::f16) + EVT OpVT = Op.getValueType(); + if (OpVT != MVT::f16 && OpVT != MVT::bf16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); @@ -2827,7 +2831,7 @@ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( - DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } @@ -3582,9 +3586,7 @@ RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; - else if (RegVT == MVT::f16) - RC = &AArch64::FPR16RegClass; - else if (RegVT == MVT::bf16) + else if (RegVT == MVT::f16 || RegVT == MVT::bf16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; @@ -5279,8 +5281,8 @@ Cmp); } - assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || - LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || + LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. 
@@ -7305,7 +7307,8 @@ return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || - VT.getVectorElementType() == MVT::f16) + VT.getVectorElementType() == MVT::f16 || + VT.getVectorElementType() == MVT::bf16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); @@ -7318,7 +7321,7 @@ unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; - else if (EltTy == MVT::i16 || EltTy == MVT::f16) + else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; @@ -7425,7 +7428,7 @@ static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; - if (EltType == MVT::i16 || EltType == MVT::f16) + if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; @@ -7661,6 +7664,7 @@ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); break; case MVT::f16: + case MVT::bf16: case MVT::f32: case MVT::f64: // Fine as is @@ -8367,8 +8371,8 @@ if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; EVT EltTy = VT.getVectorElementType(); - assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && - "Unsupported floating-point vector type"); + assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || + EltTy == MVT::f64) && "Unsupported floating-point vector type"); LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " "BITCASTS, and try again\n"); @@ -8487,11 +8491,12 @@ // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform insertion by expanding the value @@ -8521,11 +8526,12 @@ // Insertion/extraction are legal for V128 types. 
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform extraction by expanding the value @@ -13690,7 +13696,8 @@ SDLoc DL(N); SDValue Op = N->getOperand(0); - if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + if (N->getValueType(0) != MVT::i16 || + (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16)) return; Op = SDValue( diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2329,6 +2329,10 @@ [(set (f128 FPR128Op:$Rt), (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; +// bf16 load pattern +def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>; + // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. @@ -2974,6 +2978,11 @@ (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +// bf16 store pattern +def : Pat<(store (bf16 FPR16Op:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHui FPR16:$Rt, GPR64sp:$Rn, uimm12s2:$offset)>; + let AddedComplexity = 10 in { // Match all store 64 bits width whose type is compatible with FPR64 @@ -4776,6 +4785,7 @@ defm : ExtPat; defm : ExtPat; defm : ExtPat; +defm : ExtPat; defm : ExtPat; defm : ExtPat; defm : ExtPat; @@ -4897,16 +4907,29 @@ (v4f16 (DUPv4i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; +def : Pat<(v4bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v4bf16 (DUPv4i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), (v8f16 (DUPv8i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; +def : Pat<(v8bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v8bf16 (DUPv8i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v4bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v8bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; + def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), @@ -5022,6 +5045,11 @@ def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; @@ 
-5038,6 +5066,11 @@ def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -5063,6 +5096,23 @@ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), (i64 0))>; +def : Pat<(v4bf16 (vector_insert (v4bf16 V64:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi16lane + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0)), + dsub)>; + +def : Pat<(v8bf16 (vector_insert (v8bf16 V128:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexH:$imm))), + (INSvi16lane + V128:$Rn, VectorIndexH:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0))>; + def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG @@ -5144,6 +5194,7 @@ } defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; @@ -5157,6 +5208,9 @@ (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; def : Pat<(vector_extract (v8f16 V128:$Rn), 0), (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), 0), + (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>; + def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>; @@ -5164,6 +5218,8 @@ (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>; def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), + (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. 
In the general case we need an instruction, which had just as well be @@ -5179,6 +5235,7 @@ def : ConcatPat; def : ConcatPat; def : ConcatPat; +def : ConcatPat; def : ConcatPat; // If the high lanes are undef, though, we can just ignore them: @@ -6620,6 +6677,7 @@ def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6627,12 +6685,14 @@ def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6640,6 +6700,7 @@ def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -6656,6 +6717,7 @@ def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -6664,6 +6726,7 @@ def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6672,6 +6735,7 @@ def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; def : 
Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6680,6 +6744,7 @@ def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6691,6 +6756,7 @@ def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -6699,6 +6765,7 @@ def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; let Predicates = [IsLE] in { @@ -6706,6 +6773,7 @@ def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), @@ -6716,6 +6784,8 @@ (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), @@ -6730,6 +6800,8 @@ (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; @@ -6741,6 +6813,8 @@ (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; } @@ -6770,6 +6844,7 @@ def : Pat<(v1i64 (bitconvert (v4i16 
FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; } let Predicates = [IsBE] in { @@ -6781,6 +6856,8 @@ (v1i64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; } @@ -6794,6 +6871,7 @@ def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), (v2i32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), @@ -6808,6 +6886,8 @@ (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; @@ -6834,6 +6914,7 @@ (v4i16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v4bf16 FPR64:$src))), (v4i16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; @@ -6842,6 +6923,13 @@ def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), @@ -6856,8 +6944,22 @@ (v4f16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), + (v4bf16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -6867,6 +6969,7 @@ def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 
(bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), (v8i8 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), @@ -6883,6 +6986,8 @@ (v8i8 (REV64v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; } let Predicates = [IsLE] in { @@ -6891,6 +6996,7 @@ def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), (f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), @@ -6903,6 +7009,8 @@ (f64 (REV64v8i8 FPR64:$src))>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; @@ -6913,6 +7021,7 @@ def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), (v1f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), @@ -6925,6 +7034,8 @@ (v1f64 (REV64v2i32 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; @@ -6936,6 +7047,7 @@ def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), (v2f32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), @@ -6950,6 +7062,8 @@ (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; @@ -6960,6 +7074,7 @@ def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; } let Predicates = [IsBE] in { @@ -6974,6 +7089,9 @@ def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), @@ -6989,6 +7107,7 @@ def : Pat<(v2f64 
(bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; } @@ -7002,6 +7121,8 @@ (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), @@ -7013,6 +7134,7 @@ def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -7025,6 +7147,8 @@ (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), @@ -7041,6 +7165,7 @@ def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), (v2i64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), @@ -7056,6 +7181,8 @@ (v2i64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; } def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -7066,6 +7193,7 @@ def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), (v4i32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), @@ -7082,6 +7210,8 @@ (v4i32 (REV64v4i32 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; } def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; @@ -7110,6 +7240,7 @@ (v8i16 (REV32v8i16 FPR128:$src))>; } def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8bf16 FPR128:$src))), (v8i16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; @@ -7118,6 +7249,13 @@ def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; 
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+
+def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
@@ -7134,8 +7272,24 @@
                 (v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
                 (v8f16 (REV32v8i16 FPR128:$src))>;
+
+def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))),
+                 (v8bf16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                   (REV64v8i16 FPR128:$src),
+                                   (i32 8)))>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))),
+                 (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))),
+                 (v8bf16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))),
+                 (v8bf16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))),
+                 (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))),
+                 (v8bf16 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;

let Predicates = [IsLE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -7145,6 +7299,7 @@
def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), (v16i8 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
@@ -7163,6 +7318,8 @@
                 (v16i8 (REV32v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
                 (v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))),
+                 (v16i8 (REV16v16i8 FPR128:$src))>;
}

def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
@@ -7173,6 +7330,8 @@
          (EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
          (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4bf16 (extract_subvector V128:$Rn, (i64 0))),
+          (EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
          (EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
@@ -7204,6 +7363,8 @@
            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
  def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
            (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+  def : Pat<(insert_subvector undef, (v4bf16 FPR64:$src), (Ty 0)),
+            (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
  def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
}
diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
@@ -0,0 +1,218 @@
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <4 x i16> @v4bf16_to_v4i16(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v4i16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x bfloat> %a to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <2 x i32> @v4bf16_to_v2i32(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v2i32:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x bfloat> %a to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <1 x i64> @v4bf16_to_v1i64(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v1i64:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x bfloat> %a to <1 x i64>
+  ret <1 x i64> %1
+}
+
+define i64 @v4bf16_to_i64(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_i64:
+; CHECK-NEXT: fmov x0, d1
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x bfloat> %a to i64
+  ret i64 %1
+}
+
+define <2 x float> @v4bf16_to_v2float(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v2float:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x bfloat> %a to <2 x float>
+  ret <2 x float> %1
+}
+
+define <1 x double> @v4bf16_to_v1double(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v1double:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x bfloat> %a to <1 x double>
+  ret <1 x double> %1
+}
+
+define double @v4bf16_to_double(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_double:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x bfloat> %a to double
+  ret double %1
+}
+
+
+define <4 x bfloat> @v4i16_to_v4bf16(float, <4 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x i16> %a to <4 x bfloat>
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @v2i32_to_v4bf16(float, <2 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <2 x i32> %a to <4 x bfloat>
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @v1i64_to_v4bf16(float, <1 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <1 x i64> %a to <4 x bfloat>
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @i64_to_v4bf16(float, i64 %a) nounwind {
+; CHECK-LABEL: i64_to_v4bf16:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast i64 %a to <4 x bfloat>
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @v2float_to_v4bf16(float, <2 x float> %a) nounwind {
+; CHECK-LABEL: v2float_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <2 x float> %a to <4 x bfloat>
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @v1double_to_v4bf16(float, <1 x double> %a) nounwind {
+; CHECK-LABEL: v1double_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <1 x double> %a to <4 x bfloat>
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @double_to_v4bf16(float, double %a) nounwind {
+; CHECK-LABEL: double_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast double %a to <4 x bfloat>
+  ret <4 x bfloat> %1
+}
+
+define <8 x i16> @v8bf16_to_v8i16(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v8i16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <8 x bfloat> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @v8bf16_to_v4i32(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v4i32:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <8 x bfloat> %a to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @v8bf16_to_v2i64(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v2i64:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <8 x bfloat> %a to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define <4 x float> @v8bf16_to_v4float(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v4float:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <8 x bfloat> %a to <4 x float>
+  ret <4 x float> %1
+}
+
+define <2 x double> @v8bf16_to_v2double(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v2double:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <8 x bfloat> %a to <2 x double>
+  ret <2 x double> %1
+}
+
+define <8 x bfloat> @v8i16_to_v8bf16(float, <8 x i16> %a) nounwind {
+; CHECK-LABEL: v8i16_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <8 x i16> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @v4i32_to_v8bf16(float, <4 x i32> %a) nounwind {
+; CHECK-LABEL: v4i32_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x i32> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @v2i64_to_v8bf16(float, <2 x i64> %a) nounwind {
+; CHECK-LABEL: v2i64_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <2 x i64> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @v4float_to_v8bf16(float, <4 x float> %a) nounwind {
+; CHECK-LABEL: v4float_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <4 x float> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @v2double_to_v8bf16(float, <2 x double> %a) nounwind {
+; CHECK-LABEL: v2double_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <2 x double> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-none-eabi | FileCheck %s
+
+; bfloat16x4_t test_vcreate_bf16(uint64_t a) { return vcreate_bf16(a); }
+define <4 x bfloat> @test_vcreate_bf16(i64 %a) nounwind {
+; CHECK-LABEL: test_vcreate_bf16:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast i64 %a to <4 x bfloat>
+  ret <4 x bfloat> %0
+}
+
+; bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) { return vdup_n_bf16(v); }
+define <4 x bfloat> @test_vdup_n_bf16(bfloat %v) nounwind {
+; CHECK-LABEL: test_vdup_n_bf16:
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+entry:
+  %vecinit.i = insertelement <4 x bfloat> undef, bfloat %v, i32 0
+  %vecinit3.i = shufflevector <4 x bfloat> %vecinit.i, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  ret <4 x bfloat> %vecinit3.i
+}
+
+; bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { return vdupq_n_bf16(v); }
+define <8 x bfloat> @test_vdupq_n_bf16(bfloat %v) nounwind {
+; CHECK-LABEL: test_vdupq_n_bf16:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+entry:
+  %vecinit.i = insertelement <8 x bfloat> undef, bfloat %v, i32 0
+  %vecinit7.i = shufflevector <8 x bfloat> %vecinit.i, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit7.i
+}
+
+; bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { return vdup_lane_bf16(v, 1); }
+define <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdup_lane_bf16:
+; CHECK-NEXT: dup v0.4h, v0.h[1]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x bfloat> %lane
+}
+
+; bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { return vdupq_lane_bf16(v, 1); }
+define <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdupq_lane_bf16:
+; CHECK-NEXT: dup v0.8h, v0.h[1]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x bfloat> %lane
+}
+
+; bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { return vdup_laneq_bf16(v, 7); }
+define <4 x bfloat> @test_vdup_laneq_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdup_laneq_bf16:
+; CHECK-NEXT: dup v0.4h, v0.h[7]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x bfloat> %lane
+}
+
+; bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) { return vdupq_laneq_bf16(v, 7); }
+define <8 x bfloat> @test_vdupq_laneq_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdupq_laneq_bf16:
+; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x bfloat> %lane
+}
+
+; bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) { return vcombine_bf16(low, high); }
+define <8 x bfloat> @test_vcombine_bf16(<4 x bfloat> %low, <4 x bfloat> %high) nounwind {
+; CHECK-LABEL: test_vcombine_bf16:
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
+entry:
+  %shuffle.i = shufflevector <4 x bfloat> %low, <4 x bfloat> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x bfloat> %shuffle.i
+}
+
+; bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) { return vget_high_bf16(a); }
+define <4 x bfloat> @test_vget_high_bf16(<8 x bfloat> %a) nounwind {
+; CHECK-LABEL: test_vget_high_bf16:
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ret
+entry:
+  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x bfloat> %shuffle.i
+}
+
+; bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) { return vget_low_bf16(a); }
+define <4 x bfloat> @test_vget_low_bf16(<8 x bfloat> %a) nounwind {
+; CHECK-LABEL: test_vget_low_bf16:
+; CHECK-NEXT: ret
+entry:
+  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle.i
+}
+
+; bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) { return vget_lane_bf16(v, 1); }
+define bfloat @test_vget_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vget_lane_bf16:
+; CHECK-NEXT: mov h0, v0.h[1]
+; CHECK-NEXT: ret
+entry:
+  %vget_lane = extractelement <4 x bfloat> %v, i32 1
+  ret bfloat %vget_lane
+}
+
+; bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) { return vgetq_lane_bf16(v, 7); }
+define bfloat @test_vgetq_lane_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vgetq_lane_bf16:
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: ret
+entry:
+  %vgetq_lane = extractelement <8 x bfloat> %v, i32 7
+  ret bfloat %vgetq_lane
+}
+
+; bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) { return vset_lane_bf16(a, v, 1); }
+define <4 x bfloat> @test_vset_lane_bf16(bfloat %a, <4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vset_lane_bf16:
+; CHECK-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %vset_lane = insertelement <4 x bfloat> %v, bfloat %a, i32 1
+  ret <4 x bfloat> %vset_lane
+}
+
+; bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) { return vsetq_lane_bf16(a, v, 7); }
+define <8 x bfloat> @test_vsetq_lane_bf16(bfloat %a, <8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vsetq_lane_bf16:
+; CHECK-NEXT: mov v1.h[7], v0.h[0]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %vset_lane = insertelement <8 x bfloat> %v, bfloat %a, i32 7
+  ret <8 x bfloat> %vset_lane
+}
+
+; bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) { return vduph_lane_bf16(v, 1); }
+define bfloat @test_vduph_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vduph_lane_bf16:
+; CHECK-NEXT: mov h0, v0.h[1]
+; CHECK-NEXT: ret
+entry:
+  %vget_lane = extractelement <4 x bfloat> %v, i32 1
+  ret bfloat %vget_lane
+}
+
+; bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) { return vduph_laneq_bf16(v, 7); }
+define bfloat @test_vduph_laneq_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vduph_laneq_bf16:
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: ret
+entry:
+  %vgetq_lane = extractelement <8 x bfloat> %v, i32 7
+  ret bfloat %vgetq_lane
+}
diff --git a/llvm/test/CodeGen/AArch64/bf16.ll b/llvm/test/CodeGen/AArch64/bf16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -asm-verbose=0 -mtriple=arm64-eabi | FileCheck %s
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-eabi | FileCheck %s
+
+; test argument passing and simple load/store
+
+define bfloat @test_load(bfloat* %p) nounwind {
+; CHECK-LABEL: test_load:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load bfloat, bfloat* %p, align 16
+  ret bfloat %tmp1
+}
+
+define <4 x bfloat> @test_vec_load(<4 x bfloat>* %p) nounwind {
+; CHECK-LABEL: test_vec_load:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <4 x bfloat>, <4 x bfloat>* %p, align 16
+  ret <4 x bfloat> %tmp1
+}
+
+define void @test_store(bfloat* %a, bfloat %b) nounwind {
+; CHECK-LABEL: test_store:
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+  store bfloat %b, bfloat* %a, align 16
+  ret void
+}
+
+; Simple store of v4bf16
+define void @test_vec_store(<4 x bfloat>* %a, <4 x bfloat> %b) nounwind {
+; CHECK-LABEL: test_vec_store:
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+  store <4 x bfloat> %b, <4 x bfloat>* %a, align 16
+  ret void
+}