diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1396,10 +1396,12 @@ setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); + } - // But we do support custom-lowering for FCOPYSIGN. + // But we do support custom-lowering for FCOPYSIGN. + if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || + ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16())) setOperationAction(ISD::FCOPYSIGN, VT, Custom); - } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll --- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll @@ -292,9 +292,9 @@ define i32 @fcopysign(i32 %arg) { ; CHECK-LABEL: 'fcopysign' ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = call half @llvm.copysign.f16(half undef, half undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ -195,20 +195,8 @@ ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ret -; FP16-NEXT: mov h2, v1[1] -; FP16-NEXT: mov h3, v0[1] -; FP16-NEXT: movi.8h v4, #128, lsl #8 -; FP16-NEXT: mov h5, v1[2] -; FP16-NEXT: bit.16b v3, v2, v4 -; FP16-NEXT: mov h2, v0[2] -; FP16-NEXT: bit.16b v2, v5, v4 -; FP16-NEXT: mov h5, v0[3] -; FP16-NEXT: bit.16b v0, v1, v4 -; FP16-NEXT: mov h1, v1[3] -; FP16-NEXT: mov.h v0[1], v3[0] -; FP16-NEXT: mov.h v0[2], v2[0] -; FP16-NEXT: bit.16b v5, v1, v4 -; FP16-NEXT: mov.h v0[3], v5[0] +; FP16-NEXT: movi.4h v2, #128, lsl #8 +; FP16-NEXT: bit.8b v0, v1, v2 ; FP16-NEXT: ret %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %r @@ -246,20 +234,8 @@ ; NOFP16-NEXT: ret ; FP16-NEXT: fcvtn v1.4h, v1.4s -; FP16-NEXT: mov h2, v0[1] -; FP16-NEXT: movi.8h v3, #128, lsl #8 -; FP16-NEXT: mov h4, v0[2] -; FP16-NEXT: mov h5, v1[1] -; FP16-NEXT: bit.16b v2, v5, v3 -; FP16-NEXT: mov h5, v1[2] -; FP16-NEXT: bit.16b v4, v5, v3 -; FP16-NEXT: mov h5, v0[3] -; FP16-NEXT: bit.16b v0, v1, v3 -; FP16-NEXT: mov h1, v1[3] -; FP16-NEXT: mov.h v0[1], v2[0] -; FP16-NEXT: mov.h v0[2], v4[0] -; FP16-NEXT: bit.16b v5, v1, v3 -; FP16-NEXT: mov.h v0[3], v5[0] +; FP16-NEXT: movi.4h v2, #128, lsl #8 +; FP16-NEXT: bit.8b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x float> %b to <4 x half> %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0) @@ -295,23 +271,17 @@ ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ret -; FP16-NEXT: mov h3, v0[1] -; FP16-NEXT: movi.8h v4, #128, lsl #8 -; FP16-NEXT: fcvt h5, d1 -; FP16-NEXT: mov h6, v0[2] -; FP16-NEXT: mov h7, v0[3] -; FP16-NEXT: bit.16b v0, v5, v4 -; FP16-NEXT: fcvt h5, d2 -; FP16-NEXT: bit.16b v6, v5, v4 +; FP16-NEXT: mov d3, v1[1] +; FP16-NEXT: fcvt h1, d1 +; FP16-NEXT: fcvt h3, d3 +; FP16-NEXT: mov.h v1[1], v3[0] +; FP16-NEXT: fcvt h3, d2 ; FP16-NEXT: mov d2, v2[1] -; FP16-NEXT: mov d1, v1[1] -; FP16-NEXT: fcvt h1, d1 -; FP16-NEXT: fcvt h2, d2 -; FP16-NEXT: bit.16b v3, v1, v4 -; FP16-NEXT: mov.h v0[1], v3[0] -; FP16-NEXT: mov.h v0[2], v6[0] -; FP16-NEXT: bit.16b v7, v2, v4 -; FP16-NEXT: mov.h v0[3], v7[0] +; FP16-NEXT: fcvt h2, d2 +; FP16-NEXT: mov.h v1[2], v3[0] +; FP16-NEXT: mov.h v1[3], v2[0] +; FP16-NEXT: movi.4h v2, #128, lsl #8 +; FP16-NEXT: bit.8b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x half> %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0) @@ -380,36 +350,8 @@ ; NOFP16-NEXT: mov.h v0[7], v1[0] ; NOFP16-NEXT: ret -; FP16-NEXT: mov h4, v1[1] -; FP16-NEXT: mov h3, v0[1] ; FP16-NEXT: movi.8h v2, #128, lsl #8 -; FP16-NEXT: mov h5, v1[2] -; FP16-NEXT: mov h6, v0[2] -; FP16-NEXT: mov h7, v1[3] -; FP16-NEXT: mov h16, v0[3] -; FP16-NEXT: mov h17, v1[4] -; FP16-NEXT: bit.16b v3, v4, v2 -; FP16-NEXT: mov h4, v0[4] -; FP16-NEXT: bit.16b v6, v5, v2 -; FP16-NEXT: mov h5, v1[5] -; FP16-NEXT: bit.16b v16, v7, v2 -; FP16-NEXT: mov h7, v0[5] -; FP16-NEXT: bit.16b v4, v17, v2 -; FP16-NEXT: mov h17, v1[6] -; FP16-NEXT: bit.16b v7, v5, v2 -; FP16-NEXT: mov h5, v0[6] -; FP16-NEXT: bit.16b v5, v17, v2 -; FP16-NEXT: mov h17, v0[7] -; FP16-NEXT: bit.16b v0, v1, v2 -; FP16-NEXT: mov.h v0[1], v3[0] -; FP16-NEXT: mov.h v0[2], v6[0] -; FP16-NEXT: mov.h v0[3], v16[0] -; FP16-NEXT: mov.h v0[4], v4[0] -; FP16-NEXT: mov h1, v1[7] -; FP16-NEXT: mov.h v0[5], v7[0] -; FP16-NEXT: mov.h v0[6], v5[0] -; FP16-NEXT: bit.16b v17, v1, v2 -; FP16-NEXT: mov.h v0[7], v17[0] +; FP16-NEXT: bit.16b v0, v1, v2 ; FP16-NEXT: ret %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) ret <8 x half> %r @@ -475,36 +417,10 @@ ; NOFP16-NEXT: ret ; FP16-NEXT: fcvtn v2.4h, v2.4s -; FP16-NEXT: fcvtn v4.4h, v1.4s -; FP16-NEXT: mov h3, v0[1] -; FP16-NEXT: movi.8h v1, #128, lsl #8 -; FP16-NEXT: mov h5, v0[2] -; FP16-NEXT: mov h6, v0[3] -; FP16-NEXT: mov h7, v0[4] -; FP16-NEXT: mov h16, v0[5] -; FP16-NEXT: mov h17, v0[6] -; FP16-NEXT: mov h18, v4[1] -; FP16-NEXT: bit.16b v3, v18, v1 -; FP16-NEXT: mov h18, v4[2] -; FP16-NEXT: bit.16b v5, v18, v1 -; FP16-NEXT: mov h18, v0[7] -; FP16-NEXT: bit.16b v0, v4, v1 -; FP16-NEXT: mov h4, v4[3] -; FP16-NEXT: bit.16b v6, v4, v1 -; FP16-NEXT: mov h4, v2[1] -; FP16-NEXT: bit.16b v16, v4, v1 -; FP16-NEXT: mov h4, v2[2] -; FP16-NEXT: bit.16b v17, v4, v1 -; FP16-NEXT: mov.h v0[1], v3[0] -; FP16-NEXT: mov.h v0[2], v5[0] -; FP16-NEXT: mov.h v0[3], v6[0] -; FP16-NEXT: bit.16b v7, v2, v1 -; FP16-NEXT: mov h2, v2[3] -; FP16-NEXT: mov.h v0[4], v7[0] -; FP16-NEXT: mov.h v0[5], v16[0] -; FP16-NEXT: mov.h v0[6], v17[0] -; FP16-NEXT: bit.16b v18, v2, v1 -; FP16-NEXT: mov.h v0[7], v18[0] +; FP16-NEXT: fcvtn v1.4h, v1.4s +; FP16-NEXT: mov.d v1[1], v2[0] +; FP16-NEXT: movi.8h v2, #128, lsl #8 +; FP16-NEXT: bit.16b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <8 x float> %b to <8 x half> %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)