diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7256,103 +7256,88 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->hasNEON()) + return SDValue(); + EVT VT = Op.getValueType(); + EVT IntVT = VT.changeTypeToInteger(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (VT.isScalableVector()) { - if (VT != SrcVT) - return SDValue(); + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); - // copysign(x,y) -> (y & SIGN_MASK) | (x & ~SIGN_MASK) - // - // A possible alternative sequence involves using FNEG_MERGE_PASSTHRU; - // maybe useful for copysign operations with mismatched VTs. - // - // IntVT here is chosen so it's a legal type with the same element width - // as the input. - EVT IntVT = + if (VT.isScalableVector()) + IntVT = getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger()); - unsigned NumBits = VT.getScalarSizeInBits(); - SDValue SignMask = DAG.getConstant(APInt::getSignMask(NumBits), DL, IntVT); - SDValue InvSignMask = DAG.getNOT(DL, SignMask, IntVT); - SDValue Sign = DAG.getNode(ISD::AND, DL, IntVT, SignMask, - getSVESafeBitCast(IntVT, In2, DAG)); - SDValue Magnitude = DAG.getNode(ISD::AND, DL, IntVT, InvSignMask, - getSVESafeBitCast(IntVT, In1, DAG)); - SDValue IntResult = DAG.getNode(ISD::OR, DL, IntVT, Sign, Magnitude); - return getSVESafeBitCast(VT, IntResult, DAG); - } - if (!Subtarget->hasNEON()) + if (VT != In2.getValueType()) return SDValue(); - if (SrcVT.bitsLT(VT)) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT.bitsGT(VT)) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); + auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) { + if (VT.isScalableVector()) + return getSVESafeBitCast(VT, Op, DAG); - EVT VecVT; - uint64_t EltMask; - SDValue VecVal1, VecVal2; + return DAG.getBitcast(VT, Op); + }; - auto setVecVal = [&] (int Idx) { + SDValue VecVal1, VecVal2; + EVT VecVT; + auto SetVecVal = [&](int Idx = -1) { if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, - DAG.getUNDEF(VecVT), In2); + VecVal1 = + DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); + VecVal2 = + DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); + VecVal1 = BitCast(VecVT, In1, DAG); + VecVal2 = BitCast(VecVT, In2, DAG); } }; - - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); - EltMask = 0x80000000ULL; - setVecVal(AArch64::ssub); - } else if (VT == MVT::f64 || VT == MVT::v2f64) { + if (VT.isVector()) { + VecVT = IntVT; + SetVecVal(); + } else if (VT == MVT::f64) { VecVT = MVT::v2i64; - - // We want to materialize a mask with the high bit set, but the AdvSIMD - // immediate moves cannot materialize that in a single instruction for - // 64-bit elements. Instead, materialize zero and then negate it. 
- EltMask = 0; - - setVecVal(AArch64::dsub); - } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { - VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); - EltMask = 0x8000ULL; - setVecVal(AArch64::hsub); + SetVecVal(AArch64::dsub); + } else if (VT == MVT::f32) { + VecVT = MVT::v4i32; + SetVecVal(AArch64::ssub); + } else if (VT == MVT::f16) { + VecVT = MVT::v8i16; + SetVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } - SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); + unsigned BitWidth = In1.getScalarValueSizeInBits(); + SDValue SignMaskV = DAG.getConstant(APInt::getSignMask(BitWidth), DL, VecVT); - // If we couldn't materialize the mask above, then the mask vector will be - // the zero vector, and we need to negate it here. - if (VT == MVT::f64 || VT == MVT::v2f64) { - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); + if (VecVT == MVT::v2i64) { + // We want to materialize a mask with the high bit set, but the AdvSIMD + // immediate moves cannot materialize that in a single instruction for + // 64-bit elements. Instead, materialize zero and then negate it. + SignMaskV = DAG.getConstant(0, DL, VecVT); + SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV); + SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV); + SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV); } - SDValue Sel = - DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - + SDValue BSP = + DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2); if (VT == MVT::f16) - return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); + return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP); if (VT == MVT::f32) - return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); - else if (VT == MVT::f64) - return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); - else - return DAG.getNode(ISD::BITCAST, DL, VT, Sel); + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP); + if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP); + + return BitCast(VT, BSP, DAG); } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { @@ -17501,6 +17486,32 @@ return SDValue(); } +SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget, + bool fixedSVEVectorVT) { + EVT VT = N->getValueType(0); + + // Don't expand for SVE2 + if (!VT.isScalableVector() || Subtarget->hasSVE2() || + Subtarget->hasStreamingSVE()) + return SDValue(); + + // Don't expand for NEON + if (VT.isFixedLengthVector() && !fixedSVEVectorVT) + return SDValue(); + + SDLoc DL(N); + + SDValue SignMask = N->getOperand(0); + SDValue In1 = N->getOperand(1); + SDValue In2 = N->getOperand(2); + + SDValue InvSignMask = DAG.getNOT(DL, SignMask, VT); + SDValue Sign = DAG.getNode(ISD::AND, DL, VT, SignMask, In2); + SDValue Magnitude = DAG.getNode(ISD::AND, DL, VT, InvSignMask, In1); + return DAG.getNode(ISD::OR, DL, VT, Sign, Magnitude); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -17599,6 +17610,9 @@ return performVectorShiftCombine(N, *this, DCI); case AArch64ISD::SUNPKLO: return performSunpkloCombine(N, DAG); + case AArch64ISD::BSP: + return performBSPExpandForSVE( + N, DAG, Subtarget, 
useSVEForFixedLengthVectorVT(N->getValueType(0))); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3085,7 +3085,7 @@ // SVE2 bitwise ternary operations defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>; defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>; - defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>; + defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl, AArch64bsp>; defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>; defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4421,7 +4421,8 @@ let ElementSize = ElementSizeNone; } -multiclass sve2_int_bitwise_ternary_op opc, string asm, SDPatternOperator op> { +multiclass sve2_int_bitwise_ternary_op opc, string asm, SDPatternOperator op, + SDPatternOperator ir_op = null_frag> { def NAME : sve2_int_bitwise_ternary_op_d; def : InstAlias(NAME)>; def : SVE_3_Op_Pat(NAME)>; def : SVE_3_Op_Pat(NAME)>; + + + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; } class sve2_int_rotate_right_imm tsz8_64, string asm, diff --git a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-NEXT: ; kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: ; kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret entry: @@ -24,7 +24,7 @@ ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: @@ -41,7 +41,7 @@ ; CHECK-NEXT: fadd s1, s1, s2 ; CHECK-NEXT: fneg.2d v2, v3 ; CHECK-NEXT: fcvt d1, s1 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = fadd float %b, %c @@ -58,8 +58,8 @@ ; CHECK-NEXT: movi.4s v1, #128, lsl #24 ; CHECK-NEXT: fcvt s0, d0 ; CHECK-NEXT: fmov s2, #0.50000000 -; CHECK-NEXT: bit.16b v2, v0, v1 -; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: bsl.16b v1, v2, v0 +; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll --- a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple aarch64-unknown-unknown 
-aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra -frame-pointer=non-leaf | FileCheck %s --check-prefix=CHECK-CVT --check-prefix=CHECK-COMMON ; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fullfp16 -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra -frame-pointer=non-leaf | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-FP16 @@ -1101,8 +1102,8 @@ ; CHECK-CVT-LABEL: test_copysign: ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret @@ -1138,8 +1139,8 @@ ; CHECK-CVT-LABEL: test_copysign_f64: ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret @@ -1161,8 +1162,8 @@ ; CHECK-CVT-LABEL: test_copysign_extended: ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -98,7 +98,7 @@ ; CHECK-NEXT: movi v2.4s, #128, lsl #24 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret ; @@ -122,7 +122,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: fneg v2.2d, v2.2d -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret ; @@ -145,7 +145,7 @@ ; CHECK-NEXT: movi v2.4s, #128, lsl #24 ; CHECK-NEXT: fcvt s1, h1 ; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll --- a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -1,17 +1,10 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -asm-verbose=false -mtriple=aarch64-none-eabi -mattr=-fullfp16 | FileCheck %s --check-prefix=CHECK-CVT --check-prefix=CHECK-COMMON ; RUN: llc < %s -asm-verbose=false -mtriple=aarch64-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-FP16 --check-prefix=CHECK-COMMON define <4 x half> @add_h(<4 x half> %a, <4 x half> %b) { entry: -; CHECK-CVT-LABEL: add_h: -; CHECK-CVT-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h -; CHECK-CVT-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h -; CHECK-CVT-NEXT: fadd [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[RES]] - -; CHECK-FP16-LABEL: add_h: -; CHECK-FP16: fadd v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: ret + %0 = fadd <4 x half> %a, %b ret <4 x half> %0 } @@ -19,24 +12,13 @@ define <4 x half> @build_h4(<4 x half> %a) { entry: -; CHECK-COMMON-LABEL: build_h4: -; CHECK-COMMON: mov [[GPR:w[0-9]+]], #15565 -; CHECK-COMMON-NEXT: dup v0.4h, [[GPR]] ret <4 x half> } define <4 x half> @sub_h(<4 x half> 
%a, <4 x half> %b) { entry: -; CHECK-CVT-LABEL: sub_h: -; CHECK-CVT-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h -; CHECK-CVT-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h -; CHECK-CVT-NEXT: fsub [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[RES]] - -; CHECK-FP16-LABEL: sub_h: -; CHECK-FP16: fsub v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: ret + %0 = fsub <4 x half> %a, %b ret <4 x half> %0 } @@ -44,15 +26,7 @@ define <4 x half> @mul_h(<4 x half> %a, <4 x half> %b) { entry: -; CHECK-CVT-LABEL: mul_h: -; CHECK-CVT-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h -; CHECK-CVT-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h -; CHECK-CVT-NEXT: fmul [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[RES]] - -; CHECK-FP16-LABEL: mul_h: -; CHECK-FP16: fmul v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: ret + %0 = fmul <4 x half> %a, %b ret <4 x half> %0 } @@ -60,15 +34,7 @@ define <4 x half> @div_h(<4 x half> %a, <4 x half> %b) { entry: -; CHECK-CVT-LABEL: div_h: -; CHECK-CVT-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h -; CHECK-CVT-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h -; CHECK-CVT-NEXT: fdiv [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[RES]] - -; CHECK-FP16-LABEL: div_h: -; CHECK-FP16: fdiv v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: ret + %0 = fdiv <4 x half> %a, %b ret <4 x half> %0 } @@ -76,9 +42,6 @@ define <4 x half> @load_h(<4 x half>* %a) { entry: -; CHECK-COMMON-LABEL: load_h: -; CHECK-COMMON: ldr d0, [x0] -; CHECK-COMMON-NEXT: ret %0 = load <4 x half>, <4 x half>* %a, align 4 ret <4 x half> %0 } @@ -86,17 +49,11 @@ define void @store_h(<4 x half>* %a, <4 x half> %b) { entry: -; CHECK-COMMON-LABEL: store_h: -; CHECK-COMMON: str d0, [x0] -; CHECK-COMMON-NEXT: ret store <4 x half> %b, <4 x half>* %a, align 4 ret void } define <4 x half> @s_to_h(<4 x float> %a) { -; CHECK-COMMON-LABEL: s_to_h: -; CHECK-COMMON: fcvtn v0.4h, v0.4s -; CHECK-COMMON-NEXT: ret %1 = fptrunc <4 x float> %a to <4 x half> ret <4 x half> %1 } @@ -116,9 +73,6 @@ } define <4 x float> @h_to_s(<4 x half> %a) { -; CHECK-COMMON-LABEL: h_to_s: -; CHECK-COMMON: fcvtl v0.4s, v0.4h -; CHECK-COMMON-NEXT: ret %1 = fpext <4 x half> %a to <4 x float> ret <4 x float> %1 } @@ -137,394 +91,167 @@ } define <4 x half> @bitcast_i_to_h(float, <4 x i16> %a) { -; CHECK-COMMON-LABEL: bitcast_i_to_h: -; CHECK-COMMON: fmov d0, d1 -; CHECK-COMMON-NEXT: ret %2 = bitcast <4 x i16> %a to <4 x half> ret <4 x half> %2 } define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) { -; CHECK-COMMON-LABEL: bitcast_h_to_i: -; CHECK-COMMON: fmov d0, d1 -; CHECK-COMMON-NEXT: ret %2 = bitcast <4 x half> %a to <4 x i16> ret <4 x i16> %2 } define <4 x half> @sitofp_i8(<4 x i8> %a) #0 { -; CHECK-COMMON-LABEL: sitofp_i8: -; CHECK-COMMON-NEXT: shl [[OP1:v[0-9]+\.4h]], v0.4h, #8 -; CHECK-COMMON-NEXT: sshr [[OP2:v[0-9]+\.4h]], [[OP1]], #8 -; CHECK-FP16-NEXT: scvtf v0.4h, [[OP2]] -; CHECK-CVT-NEXT: sshll [[OP3:v[0-9]+\.4s]], [[OP2]], #0 -; CHECK-CVT-NEXT: scvtf [[OP4:v[0-9]+\.4s]], [[OP3]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[OP4]] -; CHECK-COMMON-NEXT: ret %1 = sitofp <4 x i8> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @sitofp_i16(<4 x i16> %a) #0 { -; CHECK-COMMON-LABEL: sitofp_i16: -; CHECK-FP16-NEXT: scvtf v0.4h, v0.4h -; CHECK-CVT-NEXT: sshll [[OP1:v[0-9]+\.4s]], v0.4h, #0 -; CHECK-CVT-NEXT: scvtf [[OP2:v[0-9]+\.4s]], [[OP1]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[OP2]] -; CHECK-COMMON-NEXT: ret %1 = sitofp <4 x i16> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @sitofp_i32(<4 x i32> %a) #0 { -; CHECK-COMMON-LABEL: sitofp_i32: -; 
CHECK-COMMON-NEXT: scvtf [[OP1:v[0-9]+\.4s]], v0.4s -; CHECK-COMMON-NEXT: fcvtn v0.4h, [[OP1]] -; CHECK-COMMON-NEXT: ret %1 = sitofp <4 x i32> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @sitofp_i64(<4 x i64> %a) #0 { -; CHECK-COMMON-LABEL: sitofp_i64: -; CHECK-COMMON-DAG: scvtf [[OP1:v[0-9]+\.2d]], v0.2d -; CHECK-COMMON-DAG: scvtf [[OP2:v[0-9]+\.2d]], v1.2d -; CHECK-COMMON-DAG: fcvtn [[OP3:v[0-9]+]].2s, [[OP1]] -; CHECK-COMMON-NEXT: fcvtn2 [[OP3]].4s, [[OP2]] -; CHECK-COMMON-NEXT: fcvtn v0.4h, [[OP3]].4s -; CHECK-COMMON-NEXT: ret %1 = sitofp <4 x i64> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @uitofp_i8(<4 x i8> %a) #0 { -; CHECK-COMMON-LABEL: uitofp_i8: -; CHECK-COMMON-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-FP16-NEXT: ucvtf v0.4h, v0.4h -; CHECK-CVT-NEXT: ushll [[OP1:v[0-9]+\.4s]], v0.4h, #0 -; CHECK-CVT-NEXT: ucvtf [[OP2:v[0-9]+\.4s]], [[OP1]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[OP2]] -; CHECK-COMMON-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @uitofp_i16(<4 x i16> %a) #0 { -; CHECK-COMMON-LABEL: uitofp_i16: -; CHECK-FP16-NEXT: ucvtf v0.4h, v0.4h -; CHECK-CVT-NEXT: ushll [[OP1:v[0-9]+\.4s]], v0.4h, #0 -; CHECK-CVT-NEXT: ucvtf [[OP2:v[0-9]+\.4s]], [[OP1]] -; CHECK-CVT-NEXT: fcvtn v0.4h, [[OP2]] -; CHECK-COMMON-NEXT: ret %1 = uitofp <4 x i16> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @uitofp_i32(<4 x i32> %a) #0 { -; CHECK-COMMON-LABEL: uitofp_i32: -; CHECK-COMMON-NEXT: ucvtf [[OP1:v[0-9]+\.4s]], v0.4s -; CHECK-COMMON-NEXT: fcvtn v0.4h, [[OP1]] -; CHECK-COMMON-NEXT: ret %1 = uitofp <4 x i32> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @uitofp_i64(<4 x i64> %a) #0 { -; CHECK-COMMON-LABEL: uitofp_i64: -; CHECK-COMMON-DAG: ucvtf [[OP1:v[0-9]+\.2d]], v0.2d -; CHECK-COMMON-DAG: ucvtf [[OP2:v[0-9]+\.2d]], v1.2d -; CHECK-COMMON-DAG: fcvtn [[OP3:v[0-9]+]].2s, [[OP1]] -; CHECK-COMMON-NEXT: fcvtn2 [[OP3]].4s, [[OP2]] -; CHECK-COMMON-NEXT: fcvtn v0.4h, [[OP3]].4s -; CHECK-COMMON-NEXT: ret %1 = uitofp <4 x i64> %a to <4 x half> ret <4 x half> %1 } define void @test_insert_at_zero(half %a, <4 x half>* %b) #0 { -; CHECK-COMMON-LABEL: test_insert_at_zero: -; CHECK-COMMON-NEXT: str d0, [x0] -; CHECK-COMMON-NEXT: ret %1 = insertelement <4 x half> undef, half %a, i64 0 store <4 x half> %1, <4 x half>* %b, align 4 ret void } define <4 x i8> @fptosi_i8(<4 x half> %a) #0 { -; CHECK-COMMON-LABEL: fptosi_i8: -; CHECK-FP16: fcvtzs v0.4h, v0.4h -; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h -; CHECK-CVT-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] -; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] -; CHECK-COMMON-NEXT: ret %1 = fptosi<4 x half> %a to <4 x i8> ret <4 x i8> %1 } define <4 x i16> @fptosi_i16(<4 x half> %a) #0 { -; CHECK-COMMON-LABEL: fptosi_i16: -; CHECK-FP16: fcvtzs v0.4h, v0.4h -; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h -; CHECK-CVT-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] -; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] -; CHECK-COMMON-NEXT: ret %1 = fptosi<4 x half> %a to <4 x i16> ret <4 x i16> %1 } define <4 x i8> @fptoui_i8(<4 x half> %a) #0 { -; CHECK-COMMON-LABEL: fptoui_i8: -; CHECK-FP16: fcvtzs v0.4h, v0.4h -; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h ; NOTE: fcvtzs selected here because the xtn shaves the sign bit -; CHECK-CVT-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] -; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] -; CHECK-COMMON-NEXT: ret %1 = fptoui<4 x half> %a to <4 x i8> ret <4 x i8> %1 } define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { -; CHECK-COMMON-LABEL: fptoui_i16: -; CHECK-FP16: fcvtzu 
v0.4h, v0.4h -; CHECK-CVT-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h -; CHECK-CVT-NEXT: fcvtzu [[REG2:v[0-9]+\.4s]], [[REG1]] -; CHECK-CVT-NEXT: xtn v0.4h, [[REG2]] -; CHECK-COMMON-NEXT: ret %1 = fptoui<4 x half> %a to <4 x i16> ret <4 x i16> %1 } define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_une: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmeq -; CHECK-CVT: mvn -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_une: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmeq v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp une <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ueq: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmgt -; CHECK-CVT: fcmgt -; CHECK-CVT: orr -; CHECK-CVT: xtn -; CHECK-CVT: mvn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_ueq: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp ueq <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ugt: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmge -; CHECK-CVT: xtn -; CHECK-CVT: mvn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_ugt: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmge v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp ugt <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uge: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmgt -; CHECK-CVT: xtn -; CHECK-CVT: mvn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_uge: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp uge <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ult: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmge -; CHECK-CVT: xtn -; CHECK-CVT: mvn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_ult: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmge v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp ult <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ule: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmgt -; CHECK-CVT: xtn -; CHECK-CVT: mvn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_ule: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp ule <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uno: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmge -; CHECK-CVT: fcmgt -; CHECK-CVT: orr -; CHECK-CVT: xtn -; CHECK-CVT: mvn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_uno: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmge v{{[0-9]}}.4h, v{{[0-9]}}.4h -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp uno <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_one: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmgt -; CHECK-CVT: fcmgt -; CHECK-CVT: orr -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_one: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp one <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> 
@test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oeq: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmeq -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_oeq: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmeq v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp oeq <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ogt: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmgt -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_ogt: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp ogt <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oge: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmge -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_oge: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmge v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp oge <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_olt: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmgt -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_olt: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp olt <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ole: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmge -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_ole: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmge v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp ole <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ord: -; CHECK-CVT: fcvtl -; CHECK-CVT: fcvtl -; CHECK-CVT: fcmge -; CHECK-CVT: fcmgt -; CHECK-CVT: orr -; CHECK-CVT: xtn -; CHECK-CVT: ret - -; CHECK-FP16-LABEL: test_fcmp_ord: -; CHECK-FP16-NOT: fcvt -; CHECK-FP16: fcmge v{{[0-9]}}.4h, v{{[0-9]}}.4h -; CHECK-FP16: fcmgt v{{[0-9]}}.4h, v{{[0-9]}}.4h + %1 = fcmp ord <4 x half> %a, %b ret <4 x i1> %1 } diff --git a/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll @@ -0,0 +1,225 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-eabi -mattr=+sve2 -o - | FileCheck --check-prefixes=CHECK %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +;============ v2f32 + +define @test_copysign_v2f32_v2f32( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v2f32_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2147483648 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: bsl z2.d, z2.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %r = call @llvm.copysign.v2f32( %a, %b) + ret %r +} + +define @test_copysign_v2f32_v2f64( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v2f32_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2147483648 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z2.s, p0/m, z1.d +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tmp0 = fptrunc %b to + %r = call @llvm.copysign.v2f32( %a, %tmp0) + ret %r +} + +declare @llvm.copysign.v2f32( %a, %b) #0 + +;============ v4f32 + +define @test_copysign_v4f32_v4f32( %a, %b) #0 { 
+; CHECK-LABEL: test_copysign_v4f32_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2147483648 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: bsl z2.d, z2.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %r = call @llvm.copysign.v4f32( %a, %b) + ret %r +} + +; SplitVecOp #1 +define @test_copysign_v4f32_v4f64( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v4f32_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2147483648 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z2.s, p0/m, z2.d +; CHECK-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-NEXT: uzp1 z2.s, z1.s, z2.s +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tmp0 = fptrunc %b to + %r = call @llvm.copysign.v4f32( %a, %tmp0) + ret %r +} + +declare @llvm.copysign.v4f32( %a, %b) #0 + +;============ v2f64 + +define @test_copysign_v2f64_v232( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v2f64_v232: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z2.d, p0/m, z1.s +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tmp0 = fpext %b to + %r = call @llvm.copysign.v2f64( %a, %tmp0) + ret %r +} + +define @test_copysign_v2f64_v2f64( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v2f64_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: bsl z2.d, z2.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %r = call @llvm.copysign.v2f64( %a, %b) + ret %r +} + +declare @llvm.copysign.v2f64( %a, %b) #0 + +;============ v4f64 + +; SplitVecRes mismatched +define @test_copysign_v4f64_v4f32( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v4f64_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: uunpkhi z5.d, z2.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: fcvt z5.d, p0/m, z5.s +; CHECK-NEXT: fcvt z2.d, p0/m, z2.s +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: bsl z3.d, z3.d, z1.d, z5.d +; CHECK-NEXT: bsl z4.d, z4.d, z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: ret + %tmp0 = fpext %b to + %r = call @llvm.copysign.v4f64( %a, %tmp0) + ret %r +} + +; SplitVecRes same +define @test_copysign_v4f64_v4f64( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v4f64_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: bsl z4.d, z4.d, z1.d, z3.d +; CHECK-NEXT: bsl z5.d, z5.d, z0.d, z2.d +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: ret + %r = call @llvm.copysign.v4f64( %a, %b) + ret %r +} + +declare @llvm.copysign.v4f64( %a, %b) #0 + +;============ v4f16 + +define @test_copysign_v4f16_v4f16( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v4f16_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32768 +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: bsl z2.d, z2.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %r = call @llvm.copysign.v4f16( %a, %b) + ret %r +} + +define @test_copysign_v4f16_v4f32( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v4f16_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32768 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z2.h, p0/m, z1.s +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tmp0 = fptrunc %b to + %r = call @llvm.copysign.v4f16( %a, %tmp0) + ret %r 
+} + +define @test_copysign_v4f16_v4f64( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v4f16_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32768 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z2.h, p0/m, z2.d +; CHECK-NEXT: fcvt z1.h, p0/m, z1.d +; CHECK-NEXT: uzp1 z2.s, z1.s, z2.s +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tmp0 = fptrunc %b to + %r = call @llvm.copysign.v4f16( %a, %tmp0) + ret %r +} + +declare @llvm.copysign.v4f16( %a, %b) #0 + +;============ v8f16 + +define @test_copysign_v8f16_v8f16( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v8f16_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32768 +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: bsl z2.d, z2.d, z0.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %r = call @llvm.copysign.v8f16( %a, %b) + ret %r +} + +define @test_copysign_v8f16_v8f32( %a, %b) #0 { +; CHECK-LABEL: test_copysign_v8f16_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32768 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z2.h, p0/m, z2.s +; CHECK-NEXT: fcvt z1.h, p0/m, z1.s +; CHECK-NEXT: uzp1 z2.h, z1.h, z2.h +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tmp0 = fptrunc %b to + %r = call @llvm.copysign.v8f16( %a, %tmp0) + ret %r +} + +declare @llvm.copysign.v8f16( %a, %b) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ -11,7 +11,7 @@ ; CHECK-LABEL: test_copysign_v1f32_v1f32: ; CHECK: ; %bb.0: ; CHECK-NEXT: movi.2s v2, #128, lsl #24 -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b) ret <1 x float> %r @@ -24,7 +24,7 @@ ; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: movi.2s v2, #128, lsl #24 ; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fptrunc <1 x double> %b to <1 x float> %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %tmp0) @@ -43,7 +43,7 @@ ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fcvtl v1.2d, v1.2s ; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp0 = fpext <1 x float> %b to <1 x double> @@ -58,7 +58,7 @@ ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) @@ -73,7 +73,7 @@ ; CHECK-LABEL: test_copysign_v2f32_v2f32: ; CHECK: ; %bb.0: ; CHECK-NEXT: movi.2s v2, #128, lsl #24 -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) ret <2 x float> %r @@ -84,7 +84,7 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: movi.2s v2, #128, lsl #24 ; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fptrunc <2 x double> %b to <2 x float> %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x 
float> %tmp0) @@ -99,7 +99,7 @@ ; CHECK-LABEL: test_copysign_v4f32_v4f32: ; CHECK: ; %bb.0: ; CHECK-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ret %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) ret <4 x float> %r @@ -112,7 +112,7 @@ ; CHECK-NEXT: fcvtn v1.2s, v1.2d ; CHECK-NEXT: movi.4s v3, #128, lsl #24 ; CHECK-NEXT: fcvtn2 v1.4s, v2.2d -; CHECK-NEXT: bit.16b v0, v1, v3 +; CHECK-NEXT: bif.16b v0, v1, v3 ; CHECK-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x float> %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0) @@ -129,7 +129,7 @@ ; CHECK-NEXT: movi.2d v2, #0000000000000000 ; CHECK-NEXT: fcvtl v1.2d, v1.2s ; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fpext <2 x float> %b to <2 x double> %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %tmp0) @@ -141,7 +141,7 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: movi.2d v2, #0000000000000000 ; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ret %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) ret <2 x double> %r @@ -159,8 +159,8 @@ ; CHECK-NEXT: fcvtl2 v4.2d, v2.4s ; CHECK-NEXT: fcvtl v2.2d, v2.2s ; CHECK-NEXT: fneg.2d v3, v3 -; CHECK-NEXT: bit.16b v1, v4, v3 -; CHECK-NEXT: bit.16b v0, v2, v3 +; CHECK-NEXT: bif.16b v1, v4, v3 +; CHECK-NEXT: bif.16b v0, v2, v3 ; CHECK-NEXT: ret %tmp0 = fpext <4 x float> %b to <4 x double> %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0) @@ -173,8 +173,8 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: movi.2d v4, #0000000000000000 ; CHECK-NEXT: fneg.2d v4, v4 -; CHECK-NEXT: bit.16b v0, v2, v4 -; CHECK-NEXT: bit.16b v1, v3, v4 +; CHECK-NEXT: bif.16b v0, v2, v4 +; CHECK-NEXT: bif.16b v1, v3, v4 ; CHECK-NEXT: ret %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) ret <4 x double> %r @@ -199,21 +199,21 @@ ; NOFP16-NEXT: fcvt s3, h3 ; NOFP16-NEXT: fcvt s4, h4 ; NOFP16-NEXT: mov h1, v1[3] -; NOFP16-NEXT: bit.16b v6, v5, v2 -; NOFP16-NEXT: fcvt s5, h7 +; NOFP16-NEXT: bit.16b v5, v6, v2 +; NOFP16-NEXT: fcvt s6, h7 ; NOFP16-NEXT: fcvt s7, h16 -; NOFP16-NEXT: bit.16b v4, v3, v2 -; NOFP16-NEXT: mov h3, v0[3] -; NOFP16-NEXT: fcvt h0, s6 +; NOFP16-NEXT: bit.16b v3, v4, v2 +; NOFP16-NEXT: mov h4, v0[3] +; NOFP16-NEXT: fcvt h0, s5 ; NOFP16-NEXT: fcvt s1, h1 -; NOFP16-NEXT: bit.16b v7, v5, v2 -; NOFP16-NEXT: fcvt h4, s4 -; NOFP16-NEXT: fcvt s3, h3 -; NOFP16-NEXT: fcvt h5, s7 -; NOFP16-NEXT: mov.h v0[1], v4[0] -; NOFP16-NEXT: bit.16b v3, v1, v2 +; NOFP16-NEXT: bit.16b v6, v7, v2 +; NOFP16-NEXT: fcvt h3, s3 +; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: fcvt h5, s6 +; NOFP16-NEXT: mov.h v0[1], v3[0] +; NOFP16-NEXT: bit.16b v1, v4, v2 ; NOFP16-NEXT: mov.h v0[2], v5[0] -; NOFP16-NEXT: fcvt h1, s3 +; NOFP16-NEXT: fcvt h1, s1 ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; NOFP16-NEXT: ret @@ -221,7 +221,7 @@ ; FP16-LABEL: test_copysign_v4f16_v4f16: ; FP16: ; %bb.0: ; FP16-NEXT: movi.4h v2, #128, lsl #8 -; FP16-NEXT: bit.8b v0, v1, v2 +; FP16-NEXT: bif.8b v0, v1, v2 ; FP16-NEXT: ret %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %r @@ -242,21 +242,21 @@ ; NOFP16-NEXT: fcvt s3, h3 ; NOFP16-NEXT: mov h1, v1[3] ; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: bit.16b v5, v6, v2 +; NOFP16-NEXT: bif.16b v5, v6, v2 ; 
NOFP16-NEXT: fcvt s6, h7 ; NOFP16-NEXT: fcvt s7, h16 ; NOFP16-NEXT: fcvt s1, h1 -; NOFP16-NEXT: bit.16b v3, v4, v2 +; NOFP16-NEXT: bif.16b v3, v4, v2 ; NOFP16-NEXT: mov h4, v0[3] ; NOFP16-NEXT: fcvt h0, s5 -; NOFP16-NEXT: bit.16b v6, v7, v2 +; NOFP16-NEXT: bif.16b v6, v7, v2 ; NOFP16-NEXT: fcvt h3, s3 ; NOFP16-NEXT: fcvt s4, h4 ; NOFP16-NEXT: fcvt h5, s6 ; NOFP16-NEXT: mov.h v0[1], v3[0] -; NOFP16-NEXT: bit.16b v4, v1, v2 +; NOFP16-NEXT: bit.16b v1, v4, v2 ; NOFP16-NEXT: mov.h v0[2], v5[0] -; NOFP16-NEXT: fcvt h1, s4 +; NOFP16-NEXT: fcvt h1, s1 ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; NOFP16-NEXT: ret @@ -265,7 +265,7 @@ ; FP16: ; %bb.0: ; FP16-NEXT: movi.4h v2, #128, lsl #8 ; FP16-NEXT: fcvtn v1.4h, v1.4s -; FP16-NEXT: bit.8b v0, v1, v2 +; FP16-NEXT: bif.8b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x float> %b to <4 x half> %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0) @@ -284,22 +284,23 @@ ; NOFP16-NEXT: mov h7, v0[2] ; NOFP16-NEXT: fcvt s4, d4 ; NOFP16-NEXT: fcvt s5, h5 -; NOFP16-NEXT: bit.16b v6, v1, v3 -; NOFP16-NEXT: fcvt s1, d2 +; NOFP16-NEXT: bit.16b v1, v6, v3 +; NOFP16-NEXT: fcvt s6, d2 ; NOFP16-NEXT: fcvt s7, h7 -; NOFP16-NEXT: bit.16b v5, v4, v3 +; NOFP16-NEXT: bit.16b v4, v5, v3 ; NOFP16-NEXT: mov d2, v2[1] -; NOFP16-NEXT: mov h4, v0[3] -; NOFP16-NEXT: fcvt h0, s6 -; NOFP16-NEXT: bit.16b v7, v1, v3 -; NOFP16-NEXT: fcvt h1, s5 +; NOFP16-NEXT: mov h5, v0[3] +; NOFP16-NEXT: fcvt h0, s1 +; NOFP16-NEXT: bit.16b v6, v7, v3 +; NOFP16-NEXT: fcvt h1, s4 ; NOFP16-NEXT: fcvt s2, d2 -; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: fcvt h5, s7 +; NOFP16-NEXT: fcvt s4, h5 +; NOFP16-NEXT: fcvt h5, s6 ; NOFP16-NEXT: mov.h v0[1], v1[0] -; NOFP16-NEXT: bit.16b v4, v2, v3 +; NOFP16-NEXT: mov.16b v1, v3 ; NOFP16-NEXT: mov.h v0[2], v5[0] -; NOFP16-NEXT: fcvt h1, s4 +; NOFP16-NEXT: bsl.16b v1, v4, v2 +; NOFP16-NEXT: fcvt h1, s1 ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; NOFP16-NEXT: ret @@ -316,7 +317,7 @@ ; FP16-NEXT: mov.h v1[2], v4[0] ; FP16-NEXT: fcvt h2, d2 ; FP16-NEXT: mov.h v1[3], v2[0] -; FP16-NEXT: bit.8b v0, v1, v3 +; FP16-NEXT: bif.8b v0, v1, v3 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x half> %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0) @@ -340,47 +341,48 @@ ; NOFP16-NEXT: fcvt s5, h5 ; NOFP16-NEXT: fcvt s6, h6 ; NOFP16-NEXT: mov h17, v0[3] -; NOFP16-NEXT: bit.16b v4, v2, v3 -; NOFP16-NEXT: mov h2, v1[3] +; NOFP16-NEXT: mov h18, v0[5] +; NOFP16-NEXT: bit.16b v2, v4, v3 +; NOFP16-NEXT: mov h4, v1[3] ; NOFP16-NEXT: fcvt s7, h7 ; NOFP16-NEXT: fcvt s16, h16 -; NOFP16-NEXT: bit.16b v6, v5, v3 +; NOFP16-NEXT: bit.16b v5, v6, v3 ; NOFP16-NEXT: fcvt s17, h17 -; NOFP16-NEXT: fcvt s18, h2 -; NOFP16-NEXT: mov h5, v1[4] -; NOFP16-NEXT: fcvt h2, s4 -; NOFP16-NEXT: bit.16b v16, v7, v3 -; NOFP16-NEXT: mov h7, v0[4] -; NOFP16-NEXT: fcvt h4, s6 -; NOFP16-NEXT: bit.16b v17, v18, v3 -; NOFP16-NEXT: mov h6, v1[5] -; NOFP16-NEXT: mov h18, v0[5] -; NOFP16-NEXT: fcvt s5, h5 +; NOFP16-NEXT: mov.16b v6, v3 +; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: fcvt h2, s2 +; NOFP16-NEXT: fcvt h5, s5 +; NOFP16-NEXT: bsl.16b v6, v16, v7 +; NOFP16-NEXT: mov h7, v1[4] +; NOFP16-NEXT: mov h16, v0[4] +; NOFP16-NEXT: bit.16b v4, v17, v3 +; NOFP16-NEXT: mov h17, v1[5] +; NOFP16-NEXT: mov.h v2[1], v5[0] ; NOFP16-NEXT: fcvt s7, h7 -; NOFP16-NEXT: mov.h v2[1], v4[0] -; NOFP16-NEXT: fcvt h4, s16 -; NOFP16-NEXT: fcvt s6, h6 -; NOFP16-NEXT: fcvt s16, h18 -; NOFP16-NEXT: 
fcvt h17, s17 -; NOFP16-NEXT: bit.16b v7, v5, v3 -; NOFP16-NEXT: mov h5, v0[6] -; NOFP16-NEXT: mov.h v2[2], v4[0] -; NOFP16-NEXT: mov h4, v1[6] -; NOFP16-NEXT: bit.16b v16, v6, v3 +; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: fcvt h5, s6 +; NOFP16-NEXT: fcvt s6, h17 +; NOFP16-NEXT: fcvt s17, h18 +; NOFP16-NEXT: fcvt h4, s4 +; NOFP16-NEXT: bit.16b v7, v16, v3 +; NOFP16-NEXT: mov h16, v0[6] +; NOFP16-NEXT: mov.h v2[2], v5[0] +; NOFP16-NEXT: mov h5, v1[6] +; NOFP16-NEXT: bit.16b v6, v17, v3 ; NOFP16-NEXT: mov h1, v1[7] +; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: mov.h v2[3], v4[0] +; NOFP16-NEXT: fcvt h4, s7 ; NOFP16-NEXT: fcvt s5, h5 -; NOFP16-NEXT: mov.h v2[3], v17[0] -; NOFP16-NEXT: fcvt h6, s7 -; NOFP16-NEXT: fcvt s4, h4 ; NOFP16-NEXT: mov h0, v0[7] ; NOFP16-NEXT: fcvt s1, h1 -; NOFP16-NEXT: mov.h v2[4], v6[0] -; NOFP16-NEXT: bit.16b v5, v4, v3 -; NOFP16-NEXT: fcvt h4, s16 +; NOFP16-NEXT: mov.h v2[4], v4[0] +; NOFP16-NEXT: fcvt h4, s6 +; NOFP16-NEXT: bit.16b v5, v16, v3 ; NOFP16-NEXT: fcvt s0, h0 -; NOFP16-NEXT: fcvt h5, s5 ; NOFP16-NEXT: mov.h v2[5], v4[0] -; NOFP16-NEXT: bit.16b v0, v1, v3 +; NOFP16-NEXT: fcvt h5, s5 +; NOFP16-NEXT: bif.16b v0, v1, v3 ; NOFP16-NEXT: mov.h v2[6], v5[0] ; NOFP16-NEXT: fcvt h0, s0 ; NOFP16-NEXT: mov.h v2[7], v0[0] @@ -390,7 +392,7 @@ ; FP16-LABEL: test_copysign_v8f16_v8f16: ; FP16: ; %bb.0: ; FP16-NEXT: movi.8h v2, #128, lsl #8 -; FP16-NEXT: bit.16b v0, v1, v2 +; FP16-NEXT: bif.16b v0, v1, v2 ; FP16-NEXT: ret %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) ret <8 x half> %r @@ -413,42 +415,43 @@ ; NOFP16-NEXT: fcvt s5, h5 ; NOFP16-NEXT: fcvt s17, h17 ; NOFP16-NEXT: fcvt s6, h6 -; NOFP16-NEXT: bit.16b v7, v16, v3 +; NOFP16-NEXT: bif.16b v7, v16, v3 ; NOFP16-NEXT: fcvt s16, h2 ; NOFP16-NEXT: fcvt s18, h18 -; NOFP16-NEXT: bit.16b v4, v6, v3 +; NOFP16-NEXT: bif.16b v4, v6, v3 ; NOFP16-NEXT: mov h6, v0[3] -; NOFP16-NEXT: bit.16b v5, v16, v3 +; NOFP16-NEXT: bif.16b v5, v16, v3 ; NOFP16-NEXT: mov h16, v1[3] ; NOFP16-NEXT: fcvt h1, s7 -; NOFP16-NEXT: mov h7, v0[5] -; NOFP16-NEXT: bit.16b v17, v18, v3 +; NOFP16-NEXT: mov.16b v7, v3 ; NOFP16-NEXT: fcvt h4, s4 ; NOFP16-NEXT: fcvt s6, h6 ; NOFP16-NEXT: fcvt s16, h16 -; NOFP16-NEXT: mov h18, v2[1] -; NOFP16-NEXT: fcvt s7, h7 ; NOFP16-NEXT: fcvt h5, s5 +; NOFP16-NEXT: bsl.16b v7, v17, v18 +; NOFP16-NEXT: mov h17, v0[5] +; NOFP16-NEXT: mov h18, v2[1] ; NOFP16-NEXT: mov.h v1[1], v4[0] -; NOFP16-NEXT: fcvt h4, s17 -; NOFP16-NEXT: bit.16b v6, v16, v3 +; NOFP16-NEXT: bif.16b v6, v16, v3 +; NOFP16-NEXT: fcvt h4, s7 +; NOFP16-NEXT: fcvt s7, h17 ; NOFP16-NEXT: fcvt s17, h18 ; NOFP16-NEXT: mov h16, v2[2] +; NOFP16-NEXT: mov h2, v2[3] +; NOFP16-NEXT: fcvt h6, s6 ; NOFP16-NEXT: mov.h v1[2], v4[0] ; NOFP16-NEXT: mov h4, v0[6] -; NOFP16-NEXT: mov h0, v0[7] -; NOFP16-NEXT: fcvt h6, s6 -; NOFP16-NEXT: mov h2, v2[3] -; NOFP16-NEXT: bit.16b v7, v17, v3 +; NOFP16-NEXT: bif.16b v7, v17, v3 ; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: mov h0, v0[7] +; NOFP16-NEXT: fcvt s2, h2 ; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: fcvt s0, h0 ; NOFP16-NEXT: mov.h v1[3], v6[0] -; NOFP16-NEXT: fcvt s2, h2 -; NOFP16-NEXT: bit.16b v4, v16, v3 +; NOFP16-NEXT: fcvt s0, h0 +; NOFP16-NEXT: bif.16b v4, v16, v3 ; NOFP16-NEXT: mov.h v1[4], v5[0] ; NOFP16-NEXT: fcvt h5, s7 -; NOFP16-NEXT: bit.16b v0, v2, v3 +; NOFP16-NEXT: bif.16b v0, v2, v3 ; NOFP16-NEXT: fcvt h4, s4 ; NOFP16-NEXT: mov.h v1[5], v5[0] ; NOFP16-NEXT: fcvt h0, s0 @@ -463,7 +466,7 @@ ; FP16-NEXT: fcvtn v1.4h, v1.4s ; FP16-NEXT: movi.8h v3, #128, lsl #8 ; FP16-NEXT: mov.d 
v1[1], v2[0] -; FP16-NEXT: bit.16b v0, v1, v3 +; FP16-NEXT: bif.16b v0, v1, v3 ; FP16-NEXT: ret %tmp0 = fptrunc <8 x float> %b to <8 x half> %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)
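
Note on the approach (not part of the patch): both the NEON lowering via AArch64ISD::BSP and the SVE1 expansion added in performBSPExpandForSVE implement the same bitwise identity that the removed scalable-vector comment stated: copysign(x, y) = (y & SIGN_MASK) | (x & ~SIGN_MASK). A minimal standalone C++ sketch of that identity on scalar doubles, with an illustrative helper name, is:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// copysign(mag, sgn) == (sgn & SIGN_MASK) | (mag & ~SIGN_MASK), evaluated
// with plain integer ops on the bit pattern of a double.
static double copysign_via_bits(double mag, double sgn) {
  uint64_t m, s;
  std::memcpy(&m, &mag, sizeof m);
  std::memcpy(&s, &sgn, sizeof s);
  const uint64_t SignMask = uint64_t(1) << 63; // only the sign bit is set
  uint64_t r = (s & SignMask) | (m & ~SignMask);
  double out;
  std::memcpy(&out, &r, sizeof out);
  return out;
}

int main() {
  assert(copysign_via_bits(3.5, -1.0) == -3.5);
  assert(copysign_via_bits(-2.25, 1.0) == std::copysign(-2.25, 1.0));
  return 0;
}

On SVE2 the BSP node is matched directly to BSL via the new pattern in sve2_int_bitwise_ternary_op; without SVE2 the combine expands it back into exactly the AND/NOT/OR sequence above.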
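
The f64/v2f64 path still materializes its sign mask as fneg of a zero vector because, as the relocated comment explains, the AdvSIMD immediate moves cannot encode a 64-bit constant with only the high bit set in a single instruction. A small standalone check (again illustrative, not part of the patch) of why that trick yields the wanted mask:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Negating +0.0 flips only the sign bit, so the resulting bit pattern is
  // exactly the 64-bit sign mask 0x8000000000000000 per lane.
  double z = -0.0; // fneg applied to +0.0
  uint64_t bits;
  std::memcpy(&bits, &z, sizeof bits);
  assert(bits == (uint64_t(1) << 63));
  return 0;
}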