Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -647,6 +647,7 @@ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -755,6 +755,17 @@ setOperationAction(ISD::FROUND, Ty, Legal); } + if (Subtarget->hasFullFP16()) { + for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + } + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } @@ -2336,7 +2347,8 @@ SDLoc(Op)).first; } -static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { +SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. @@ -2344,8 +2356,9 @@ EVT VT = Op.getValueType(); unsigned NumElts = InVT.getVectorNumElements(); - // f16 vectors are promoted to f32 before a conversion. 
- if (InVT.getVectorElementType() == MVT::f16) { + // f16 conversions are promoted to f32 when full fp16 is not supported. + if (InVT.getVectorElementType() == MVT::f16 && + !Subtarget->hasFullFP16()) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); return DAG.getNode( Index: llvm/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll +++ llvm/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll @@ -141,6 +141,15 @@ %1 = call %v4f16 @llvm.nearbyint.v4f16(%v4f16 %a) ret %v4f16 %1 } +define %v4f16 @test_v4f16.round(%v4f16 %a) { + ; CHECK-LABEL: test_v4f16.round: + ; CHECK-NOFP16-COUNT-4: frinta s{{[0-9]+}}, s{{[0-9]+}} + ; CHECK-FP16-NOT: fcvt + ; CHECK-FP16: frinta.4h + ; CHECK-FP16-NEXT: ret + %1 = call %v4f16 @llvm.round.v4f16(%v4f16 %a) + ret %v4f16 %1 +} declare %v4f16 @llvm.sqrt.v4f16(%v4f16) #0 declare %v4f16 @llvm.powi.v4f16(%v4f16, i32) #0 @@ -159,6 +168,7 @@ declare %v4f16 @llvm.trunc.v4f16(%v4f16) #0 declare %v4f16 @llvm.rint.v4f16(%v4f16) #0 declare %v4f16 @llvm.nearbyint.v4f16(%v4f16) #0 +declare %v4f16 @llvm.round.v4f16(%v4f16) #0 ;;; @@ -298,6 +308,15 @@ %1 = call %v8f16 @llvm.nearbyint.v8f16(%v8f16 %a) ret %v8f16 %1 } +define %v8f16 @test_v8f16.round(%v8f16 %a) { + ; CHECK-LABEL: test_v8f16.round: + ; CHECK-NOFP16-COUNT-8: frinta s{{[0-9]+}}, s{{[0-9]+}} + ; CHECK-FP16-NOT: fcvt + ; CHECK-FP16: frinta.8h + ; CHECK-FP16-NEXT: ret + %1 = call %v8f16 @llvm.round.v8f16(%v8f16 %a) + ret %v8f16 %1 +} declare %v8f16 @llvm.sqrt.v8f16(%v8f16) #0 declare %v8f16 @llvm.powi.v8f16(%v8f16, i32) #0 @@ -316,6 +335,7 @@ declare %v8f16 @llvm.trunc.v8f16(%v8f16) #0 declare %v8f16 @llvm.rint.v8f16(%v8f16) #0 declare %v8f16 @llvm.nearbyint.v8f16(%v8f16) #0 +declare %v8f16 @llvm.round.v8f16(%v8f16) #0 ;;; Float vectors Index: llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll =================================================================== --- 
llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -257,40 +257,44 @@ define <4 x i8> @fptosi_i8(<4 x half> %a) #0 { ; CHECK-COMMON-LABEL: fptosi_i8: -; CHECK-COMMON-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h -; CHECK-COMMON-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] -; CHECK-COMMON-NEXT: xtn v0.4h, [[REG2]] -; CHECK-COMMON-NEXT: ret +; CHECK-FP16: fcvtzs v0.4h, v0.4h +; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-CVT-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] +; CHECK-COMMON-NEXT: ret %1 = fptosi<4 x half> %a to <4 x i8> ret <4 x i8> %1 } define <4 x i16> @fptosi_i16(<4 x half> %a) #0 { ; CHECK-COMMON-LABEL: fptosi_i16: -; CHECK-COMMON-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h -; CHECK-COMMON-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] -; CHECK-COMMON-NEXT: xtn v0.4h, [[REG2]] -; CHECK-COMMON-NEXT: ret +; CHECK-FP16: fcvtzs v0.4h, v0.4h +; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-CVT-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] +; CHECK-COMMON-NEXT: ret %1 = fptosi<4 x half> %a to <4 x i16> ret <4 x i16> %1 } define <4 x i8> @fptoui_i8(<4 x half> %a) #0 { ; CHECK-COMMON-LABEL: fptoui_i8: -; CHECK-COMMON-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-FP16: fcvtzs v0.4h, v0.4h +; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h ; NOTE: fcvtzs selected here because the xtn shaves the sign bit -; CHECK-COMMON-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] -; CHECK-COMMON-NEXT: xtn v0.4h, [[REG2]] -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] +; CHECK-COMMON-NEXT: ret %1 = fptoui<4 x half> %a to <4 x i8> ret <4 x i8> %1 } define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { ; CHECK-COMMON-LABEL: fptoui_i16: -; CHECK-COMMON-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h -; CHECK-COMMON-NEXT: fcvtzu [[REG2:v[0-9]+\.4s]], [[REG1]] -; 
CHECK-COMMON-NEXT: xtn v0.4h, [[REG2]] +; CHECK-FP16: fcvtzu v0.4h, v0.4h +; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-CVT-NEXT: fcvtzu [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] ; CHECK-COMMON-NEXT: ret %1 = fptoui<4 x half> %a to <4 x i16> ret <4 x i16> %1 Index: llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll =================================================================== --- llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -395,40 +395,45 @@ define <8 x i8> @fptosi_i8(<8 x half> %a) #0 { ; CHECK-LABEL: fptosi_i8: -; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h -; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h -; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] -; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] -; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] -; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] -; CHECK-NEXT: xtn v0.8b, [[I16]].8h -; CHECK-NEXT: ret +; CHECK-FP16-NEXT: fcvtzs [[LO:v[0-9]+\.8h]], v0.8h +; CHECK-CVT-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-CVT-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-CVT-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-CVT-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-CVT-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-CVT-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-CVT-DAG: xtn v0.8b, [[I16]].8h +; CHECK-FP16-NEXT: xtn v0.8b, [[LO]] +; CHECK-NEXT: ret %1 = fptosi<8 x half> %a to <8 x i8> ret <8 x i8> %1 } define <8 x i16> @fptosi_i16(<8 x half> %a) #0 { ; CHECK-LABEL: fptosi_i16: -; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h -; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h -; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] -; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] -; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] -; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] -; CHECK-NEXT: ret +; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h +; CHECK-CVT-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-CVT-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], 
v0.8h +; CHECK-CVT-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-CVT-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-CVT-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-CVT-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret %1 = fptosi<8 x half> %a to <8 x i16> ret <8 x i16> %1 } define <8 x i8> @fptoui_i8(<8 x half> %a) #0 { ; CHECK-LABEL: fptoui_i8: -; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h -; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h -; CHECK-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] -; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] -; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] -; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] -; CHECK-NEXT: xtn v0.8b, [[I16]].8h +; CHECK-FP16-NEXT: fcvtzu [[LO:v[0-9]+\.8h]], v0.8h +; CHECK-CVT-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-CVT-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-CVT-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-CVT-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-CVT-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-CVT-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-CVT-DAG: xtn v0.8b, [[I16]].8h +; CHECK-FP16-NEXT: xtn v0.8b, [[LO]] ; CHECK-NEXT: ret %1 = fptoui<8 x half> %a to <8 x i8> ret <8 x i8> %1 @@ -436,13 +441,14 @@ define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { ; CHECK-LABEL: fptoui_i16: -; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h -; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h -; CHECK-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] -; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] -; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] -; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] -; CHECK-NEXT: ret +; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h +; CHECK-CVT-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-CVT-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-CVT-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-CVT-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-CVT-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-CVT-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret %1 = fptoui<8 x half> %a to <8 
x i16> ret <8 x i16> %1 }