Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -321,6 +321,10 @@ setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + if (Subtarget->hasFullFP16()) + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); + else + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::FPOW, MVT::f16, Promote); @@ -333,7 +337,6 @@ setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); if (!Subtarget->hasFullFP16()) { setOperationAction(ISD::SELECT, MVT::f16, Promote); @@ -4086,25 +4089,26 @@ In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; - EVT EltVT; uint64_t EltMask; SDValue VecVal1, VecVal2; - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - EltVT = MVT::i32; - VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); - EltMask = 0x80000000ULL; + auto setVecVal = [&] (int Idx) { if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } + }; + + if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); + EltMask = 0x80000000ULL; + setVecVal(AArch64::ssub); } else if (VT == MVT::f64 || VT == MVT::v2f64) { - EltVT = MVT::i64; VecVT = MVT::v2i64; // We want to materialize a mask with the high bit set, but the AdvSIMD @@ -4112,15 +4116,11 @@ // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } + setVecVal(AArch64::dsub); + } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { + VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); + EltMask = 0x8000ULL; + setVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } @@ -4138,6 +4138,8 @@ SDValue Sel = DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); + if (VT == MVT::f16) + return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); else if (VT == MVT::f64) Index: test/CodeGen/AArch64/f16-instructions.ll =================================================================== --- test/CodeGen/AArch64/f16-instructions.ll +++ test/CodeGen/AArch64/f16-instructions.ll @@ -934,37 +934,57 @@ ret half %r } -; CHECK-COMMON-LABEL: test_copysign: -; CHECK-COMMON-NEXT: fcvt s1, h1 -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: fcvt h0, s0 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign: +; CHECK-CVT-NEXT: fcvt s1, h1 +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign: +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: ret + define half @test_copysign(half %a, half %b) #0 { %r = call half @llvm.copysign.f16(half %a, half %b) ret half %r } -; CHECK-COMMON-LABEL: test_copysign_f32: -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: fcvt h0, s0 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign_f32: +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign_f32: +; CHECK-FP16-NEXT: fcvt h1, s1 +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: ret + define half @test_copysign_f32(half %a, float %b) #0 { %tb = fptrunc float %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) ret half %r } -; CHECK-COMMON-LABEL: test_copysign_f64: -; CHECK-COMMON-NEXT: fcvt s1, d1 -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: fcvt h0, s0 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign_f64: +; CHECK-CVT-NEXT: fcvt s1, d1 +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign_f64: +; CHECK-FP16-NEXT: fcvt h1, d1 +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: ret + define half @test_copysign_f64(half %a, double %b) #0 { %tb = fptrunc double %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) @@ -974,12 +994,19 @@ ; Check that the FP promotion will use a truncating FP_ROUND, so we can fold ; away the (fpext (fp_round )) here. -; CHECK-COMMON-LABEL: test_copysign_extended: -; CHECK-COMMON-NEXT: fcvt s1, h1 -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign_extended: +; CHECK-CVT-NEXT: fcvt s1, h1 +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign_extended: +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: fcvt s0, h0 +; CHECK-FP16-NEXT: ret + define float @test_copysign_extended(half %a, half %b) #0 { %r = call half @llvm.copysign.f16(half %a, half %b) %xr = fpext half %r to float