diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7313,103 +7313,78 @@
 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                               SelectionDAG &DAG) const {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
   EVT VT = Op.getValueType();
+  EVT IntVT = VT.changeTypeToInteger();
   SDLoc DL(Op);
   SDValue In1 = Op.getOperand(0);
   SDValue In2 = Op.getOperand(1);
   EVT SrcVT = In2.getValueType();
 
-  if (VT.isScalableVector()) {
-    if (VT != SrcVT)
-      return SDValue();
+  if (SrcVT.bitsLT(VT))
+    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+  else if (SrcVT.bitsGT(VT))
+    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
 
-    // copysign(x,y) -> (y & SIGN_MASK) | (x & ~SIGN_MASK)
-    //
-    // A possible alternative sequence involves using FNEG_MERGE_PASSTHRU;
-    // maybe useful for copysign operations with mismatched VTs.
-    //
-    // IntVT here is chosen so it's a legal type with the same element width
-    // as the input.
-    EVT IntVT =
+  if (VT.isScalableVector())
+    IntVT =
         getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
-    unsigned NumBits = VT.getScalarSizeInBits();
-    SDValue SignMask = DAG.getConstant(APInt::getSignMask(NumBits), DL, IntVT);
-    SDValue InvSignMask = DAG.getNOT(DL, SignMask, IntVT);
-    SDValue Sign = DAG.getNode(ISD::AND, DL, IntVT, SignMask,
-                               getSVESafeBitCast(IntVT, In2, DAG));
-    SDValue Magnitude = DAG.getNode(ISD::AND, DL, IntVT, InvSignMask,
-                                    getSVESafeBitCast(IntVT, In1, DAG));
-    SDValue IntResult = DAG.getNode(ISD::OR, DL, IntVT, Sign, Magnitude);
-    return getSVESafeBitCast(VT, IntResult, DAG);
-  }
 
-  if (!Subtarget->hasNEON())
+  if (VT != In2.getValueType())
     return SDValue();
 
-  if (SrcVT.bitsLT(VT))
-    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
-  else if (SrcVT.bitsGT(VT))
-    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
+  auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
+    if (VT.isScalableVector())
+      return getSVESafeBitCast(VT, Op, DAG);
 
-  EVT VecVT;
-  uint64_t EltMask;
-  SDValue VecVal1, VecVal2;
+    return DAG.getBitcast(VT, Op);
+  };
 
-  auto setVecVal = [&] (int Idx) {
+  SDValue VecVal1, VecVal2;
+  EVT VecVT;
+  auto SetVecVal = [&](int Idx = -1) {
     if (!VT.isVector()) {
-      VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
-                                          DAG.getUNDEF(VecVT), In1);
-      VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
-                                          DAG.getUNDEF(VecVT), In2);
+      VecVal1 =
+          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
+      VecVal2 =
+          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
     } else {
-      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
-      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+      VecVal1 = BitCast(VecVT, In1, DAG);
+      VecVal2 = BitCast(VecVT, In2, DAG);
     }
   };
-
-  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
-    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
-    EltMask = 0x80000000ULL;
-    setVecVal(AArch64::ssub);
-  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+  if (VT.isVector()) {
+    VecVT = IntVT;
+    SetVecVal();
+  } else if (VT == MVT::f64) {
     VecVT = MVT::v2i64;
-
-    // We want to materialize a mask with the high bit set, but the AdvSIMD
-    // immediate moves cannot materialize that in a single instruction for
-    // 64-bit elements. Instead, materialize zero and then negate it.
-    EltMask = 0;
-
-    setVecVal(AArch64::dsub);
-  } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
-    VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
-    EltMask = 0x8000ULL;
-    setVecVal(AArch64::hsub);
+    SetVecVal(AArch64::dsub);
+  } else if (VT == MVT::f32) {
+    VecVT = MVT::v4i32;
+    SetVecVal(AArch64::ssub);
+  } else if (VT == MVT::f16) {
+    VecVT = MVT::v8i16;
+    SetVecVal(AArch64::hsub);
   } else {
     llvm_unreachable("Invalid type for copysign!");
  }
 
-  SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
-
-  // If we couldn't materialize the mask above, then the mask vector will be
-  // the zero vector, and we need to negate it here.
-  if (VT == MVT::f64 || VT == MVT::v2f64) {
-    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
-    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
-    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
-  }
-
-  SDValue Sel =
-      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
+  unsigned BitWidth = In1.getScalarValueSizeInBits();
+  SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
 
+  SDValue BSP =
+      DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
   if (VT == MVT::f16)
-    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
+    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
   if (VT == MVT::f32)
-    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
-  else if (VT == MVT::f64)
-    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
-  else
-    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
+  if (VT == MVT::f64)
+    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
+
+  return BitCast(VT, BSP, DAG);
 }
 
 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
@@ -17675,6 +17650,32 @@
   return SDValue();
 }
 
+SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
+                               const AArch64Subtarget *Subtarget,
+                               bool fixedSVEVectorVT) {
+  EVT VT = N->getValueType(0);
+
+  // Don't expand for SVE2
+  if (!VT.isScalableVector() || Subtarget->hasSVE2() ||
+      Subtarget->hasStreamingSVE())
+    return SDValue();
+
+  // Don't expand for NEON
+  if (VT.isFixedLengthVector() && !fixedSVEVectorVT)
+    return SDValue();
+
+  SDLoc DL(N);
+
+  SDValue Mask = N->getOperand(0);
+  SDValue In1 = N->getOperand(1);
+  SDValue In2 = N->getOperand(2);
+
+  SDValue InvMask = DAG.getNOT(DL, Mask, VT);
+  SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
+  SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
+  return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -17773,6 +17774,9 @@
     return performVectorShiftCombine(N, *this, DCI);
   case AArch64ISD::SUNPKLO:
     return performSunpkloCombine(N, DAG);
+  case AArch64ISD::BSP:
+    return performBSPExpandForSVE(
+        N, DAG, Subtarget, useSVEForFixedLengthVectorVT(N->getValueType(0)));
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3091,7 +3091,7 @@
   // SVE2 bitwise ternary operations
   defm EOR3_ZZZZ  : sve2_int_bitwise_ternary_op<0b000, "eor3",  int_aarch64_sve_eor3>;
   defm BCAX_ZZZZ  :
sve2_int_bitwise_ternary_op<0b010, "bcax",  int_aarch64_sve_bcax>;
-  defm BSL_ZZZZ   : sve2_int_bitwise_ternary_op<0b001, "bsl",   int_aarch64_sve_bsl>;
+  defm BSL_ZZZZ   : sve2_int_bitwise_ternary_op<0b001, "bsl",   int_aarch64_sve_bsl, AArch64bsp>;
   defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
   defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
   defm NBSL_ZZZZ  : sve2_int_bitwise_ternary_op<0b111, "nbsl",  int_aarch64_sve_nbsl>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -513,6 +513,14 @@
                 (vt (AArch64dup (it immL))))),
       (inst $Pg, $Zs1, imm)>;
 
+// Used to re-order the operands of BSP when lowering to BSL. BSP has the order:
+// mask, in1, in2 whereas BSL for SVE2 has them ordered in1, in2, mask
+
+class SVE_3_Op_BSP_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                       ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
+      (inst $Op2, $Op3, $Op1)>;
+
 //
 // Pseudo -> Instruction mappings
 //
@@ -4458,7 +4466,8 @@
   let ElementSize = ElementSizeNone;
 }
 
-multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op,
+                                       SDPatternOperator ir_op = null_frag> {
   def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
 
   def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
                   (!cast<Instruction>(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>;
   def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
   def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
+
+  def : SVE_3_Op_BSP_Pat<nxv16i8, ir_op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_BSP_Pat<nxv8i16, ir_op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_BSP_Pat<nxv4i32, ir_op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_BSP_Pat<nxv2i64, ir_op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
 }
 
 class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
diff --git a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
--- a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -6,10 +6,10 @@
 define float @test1(float %x, float %y) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    movi.4s v2, #128, lsl #24
+; CHECK-NEXT:    mvni.4s v2, #128, lsl #24
 ; CHECK-NEXT:    ; kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    ; kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    bif.16b v0, v1, v2
 ; CHECK-NEXT:    ; kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -20,11 +20,11 @@
 define double @test2(double %x, double %y) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    mov x8, #9223372036854775807
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    fneg.2d v2, v2
-; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    dup.2d v2, x8
+; CHECK-NEXT:    bif.16b v0, v1, v2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -36,12 +36,12 @@
 define double @test3(double %a, float %b, float %c) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.2d v3, #0000000000000000
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fadd s1, s1, s2
-; CHECK-NEXT:    fneg.2d v2, v3
+; CHECK-NEXT:    mov x8, #9223372036854775807
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup.2d v2, x8
 ; CHECK-NEXT:    fcvt d1, s1
-; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    bif.16b v0, v1, v2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 %tmp1 = fadd float %b, %c
@@ -55,11 +55,11 @@
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]!
; 16-byte Folded Spill ; CHECK-NEXT: bl _bar -; CHECK-NEXT: movi.4s v1, #128, lsl #24 +; CHECK-NEXT: mvni.4s v1, #128, lsl #24 ; CHECK-NEXT: fcvt s0, d0 ; CHECK-NEXT: fmov s2, #0.50000000 -; CHECK-NEXT: bit.16b v2, v0, v1 -; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: bsl.16b v1, v2, v0 +; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll --- a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra -frame-pointer=non-leaf | FileCheck %s --check-prefix=CHECK-CVT --check-prefix=CHECK-COMMON ; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fullfp16 -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra -frame-pointer=non-leaf | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-FP16 @@ -1100,16 +1101,16 @@ } ; CHECK-CVT-LABEL: test_copysign: -; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign: -; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 -; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bif.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret define half @test_copysign(half %a, half %b) #0 { @@ -1118,16 +1119,16 @@ } ; CHECK-CVT-LABEL: test_copysign_f32: -; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f32: -; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: fcvt h1, s1 -; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: bif.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret define half @test_copysign_f32(half %a, float %b) #0 { @@ -1137,17 +1138,17 @@ } ; CHECK-CVT-LABEL: test_copysign_f64: -; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f64: -; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: fcvt h1, d1 -; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: bif.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret define half @test_copysign_f64(half %a, double %b) #0 { @@ -1160,15 +1161,15 @@ ; away the (fpext (fp_round )) here. 
; CHECK-CVT-LABEL: test_copysign_extended: -; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_extended: -; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 -; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bif.16b v0, v1, v2 ; CHECK-FP16-NEXT: fcvt s0, h0 ; CHECK-FP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -95,10 +95,10 @@ define float @copysign32(float %a, float %b) { ; CHECK-LABEL: copysign32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v2.4s, #128, lsl #24 +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret ; @@ -118,11 +118,11 @@ define double @copysign64(double %a, double %b) { ; CHECK-LABEL: copysign64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: fneg v2.2d, v2.2d -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v2.2d, x8 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret ; @@ -142,10 +142,10 @@ define half @copysign16(half %a, half %b) { ; CHECK-LABEL: copysign16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v2.4s, #128, lsl #24 +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-NEXT: fcvt s1, h1 ; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll @@ -8,9 +8,9 @@ define @test_copysign_v2f32_v2f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v2f32_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v2f32( %a, %b) ret %r @@ -23,7 +23,7 @@ ; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v2f32( %a, %tmp0) @@ -37,9 +37,9 @@ define @test_copysign_v4f32_v4f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f32_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v4f32( %a, %b) ret %r @@ -55,7 +55,7 @@ ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, 
z1.d, z0.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v4f32( %a, %tmp0) @@ -73,7 +73,7 @@ ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s ; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %tmp0 = fpext %b to %r = call @llvm.copysign.v2f64( %a, %tmp0) @@ -83,9 +83,9 @@ define @test_copysign_v2f64_v2f64( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v2f64_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v2f64( %a, %b) ret %r @@ -108,8 +108,8 @@ ; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 ; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 ; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff -; CHECK-NEXT: orr z0.d, z2.d, z0.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d ; CHECK-NEXT: ret %tmp0 = fpext %b to %r = call @llvm.copysign.v4f64( %a, %tmp0) @@ -122,10 +122,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 -; CHECK-NEXT: orr z0.d, z2.d, z0.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v4f64( %a, %b) ret %r @@ -138,9 +138,9 @@ define @test_copysign_v4f16_v4f16( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f16_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v4f16( %a, %b) ret %r @@ -153,7 +153,7 @@ ; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v4f16( %a, %tmp0) @@ -169,7 +169,7 @@ ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v4f16( %a, %tmp0) @@ -183,9 +183,9 @@ define @test_copysign_v8f16_v8f16( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v8f16_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v8f16( %a, %b) ret %r @@ -200,7 +200,7 @@ ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v8f16( %a, %tmp0) diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll copy from llvm/test/CodeGen/AArch64/sve-fcopysign.ll copy to 
llvm/test/CodeGen/AArch64/sve2-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple aarch64-eabi -mattr=+sve -o - | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -mtriple aarch64-eabi -mattr=+sve2 -o - | FileCheck --check-prefixes=CHECK %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -8,9 +8,9 @@ define @test_copysign_v2f32_v2f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v2f32_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff -; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v2f32( %a, %b) ret %r @@ -19,11 +19,11 @@ define @test_copysign_v2f32_v2f64( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v2f32_v2f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2147483647 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d -; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v2f32( %a, %tmp0) @@ -37,9 +37,9 @@ define @test_copysign_v4f32_v4f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f32_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff -; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v4f32( %a, %b) ret %r @@ -49,13 +49,13 @@ define @test_copysign_v4f32_v4f64( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f32_v4f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2147483647 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s -; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v4f32( %a, %tmp0) @@ -70,10 +70,9 @@ ; CHECK-LABEL: test_copysign_v2f64_v232: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s -; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %tmp0 = fpext %b to %r = call @llvm.copysign.v2f64( %a, %tmp0) @@ -83,9 +82,8 @@ define @test_copysign_v2f64_v2f64( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v2f64_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v2f64( %a, %b) ret %r @@ -104,12 +102,9 @@ ; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: fcvt z3.d, p0/m, z3.s ; CHECK-NEXT: fcvt z2.d, p0/m, z2.s -; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 -; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 -; CHECK-NEXT: and 
z1.d, z1.d, #0x7fffffffffffffff -; CHECK-NEXT: orr z0.d, z2.d, z0.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: bsl z0.d, z0.d, z2.d, z4.d +; CHECK-NEXT: bsl z1.d, z1.d, z3.d, z4.d ; CHECK-NEXT: ret %tmp0 = fpext %b to %r = call @llvm.copysign.v4f64( %a, %tmp0) @@ -120,12 +115,9 @@ define @test_copysign_v4f64_v4f64( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 -; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff -; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 -; CHECK-NEXT: orr z0.d, z2.d, z0.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: bsl z0.d, z0.d, z2.d, z4.d +; CHECK-NEXT: bsl z1.d, z1.d, z3.d, z4.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v4f64( %a, %b) ret %r @@ -138,9 +130,9 @@ define @test_copysign_v4f16_v4f16( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f16_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.h, z0.h, #0x7fff -; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov w8, #32767 +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v4f16( %a, %b) ret %r @@ -149,11 +141,11 @@ define @test_copysign_v4f16_v4f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f16_v4f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32767 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s -; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v4f16( %a, %tmp0) @@ -163,13 +155,13 @@ define @test_copysign_v4f16_v4f64( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f16_v4f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32767 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: fcvt z2.h, p0/m, z2.d ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s -; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v4f16( %a, %tmp0) @@ -183,9 +175,9 @@ define @test_copysign_v8f16_v8f16( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v8f16_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.h, z0.h, #0x7fff -; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov w8, #32767 +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %r = call @llvm.copysign.v8f16( %a, %b) ret %r @@ -194,13 +186,13 @@ define @test_copysign_v8f16_v8f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v8f16_v8f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32767 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: fcvt z2.h, p0/m, z2.s ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h -; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to %r = call @llvm.copysign.v8f16( %a, %tmp0) diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ 
-10,8 +10,8 @@ define <1 x float> @test_copysign_v1f32_v1f32(<1 x float> %a, <1 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v1f32_v1f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2s v2, #128, lsl #24 -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: mvni.2s v2, #128, lsl #24 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b) ret <1 x float> %r @@ -22,9 +22,9 @@ ; CHECK-LABEL: test_copysign_v1f32_v1f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: movi.2s v2, #128, lsl #24 +; CHECK-NEXT: mvni.2s v2, #128, lsl #24 ; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fptrunc <1 x double> %b to <1 x float> %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %tmp0) @@ -39,11 +39,11 @@ define <1 x double> @test_copysign_v1f64_v1f32(<1 x double> %a, <1 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v1f64_v1f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2d v2, #0000000000000000 -; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: fcvtl v1.2d, v1.2s -; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup.2d v2, x8 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp0 = fpext <1 x float> %b to <1 x double> @@ -54,11 +54,11 @@ define <1 x double> @test_copysign_v1f64_v1f64(<1 x double> %a, <1 x double> %b) #0 { ; CHECK-LABEL: test_copysign_v1f64_v1f64: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: dup.2d v2, x8 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) @@ -72,8 +72,8 @@ define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v2f32_v2f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2s v2, #128, lsl #24 -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: mvni.2s v2, #128, lsl #24 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) ret <2 x float> %r @@ -82,9 +82,9 @@ define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_copysign_v2f32_v2f64: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2s v2, #128, lsl #24 +; CHECK-NEXT: mvni.2s v2, #128, lsl #24 ; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fptrunc <2 x double> %b to <2 x float> %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %tmp0) @@ -98,8 +98,8 @@ define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v4f32_v4f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: mvni.4s v2, #128, lsl #24 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ret %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) ret <4 x float> %r @@ -110,9 +110,9 @@ ; CHECK-LABEL: test_copysign_v4f32_v4f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: 
fcvtn v1.2s, v1.2d -; CHECK-NEXT: movi.4s v3, #128, lsl #24 +; CHECK-NEXT: mvni.4s v3, #128, lsl #24 ; CHECK-NEXT: fcvtn2 v1.4s, v2.2d -; CHECK-NEXT: bit.16b v0, v1, v3 +; CHECK-NEXT: bif.16b v0, v1, v3 ; CHECK-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x float> %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0) @@ -126,10 +126,10 @@ define <2 x double> @test_copysign_v2f64_v232(<2 x double> %a, <2 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v2f64_v232: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: fcvtl v1.2d, v1.2s -; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: dup.2d v2, x8 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fpext <2 x float> %b to <2 x double> %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %tmp0) @@ -139,9 +139,9 @@ define <2 x double> @test_copysign_v2f64_v2f64(<2 x double> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_copysign_v2f64_v2f64: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2d v2, #0000000000000000 -; CHECK-NEXT: fneg.2d v2, v2 -; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: mov x8, #9223372036854775807 +; CHECK-NEXT: dup.2d v2, x8 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ret %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) ret <2 x double> %r @@ -155,12 +155,12 @@ define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2d v3, #0000000000000000 -; CHECK-NEXT: fcvtl2 v4.2d, v2.4s +; CHECK-NEXT: mov x8, #9223372036854775807 +; CHECK-NEXT: fcvtl2 v3.2d, v2.4s ; CHECK-NEXT: fcvtl v2.2d, v2.2s -; CHECK-NEXT: fneg.2d v3, v3 -; CHECK-NEXT: bit.16b v1, v4, v3 -; CHECK-NEXT: bit.16b v0, v2, v3 +; CHECK-NEXT: dup.2d v4, x8 +; CHECK-NEXT: bif.16b v1, v3, v4 +; CHECK-NEXT: bif.16b v0, v2, v4 ; CHECK-NEXT: ret %tmp0 = fpext <4 x float> %b to <4 x double> %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0) @@ -171,10 +171,10 @@ define <4 x double> @test_copysign_v4f64_v4f64(<4 x double> %a, <4 x double> %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f64: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.2d v4, #0000000000000000 -; CHECK-NEXT: fneg.2d v4, v4 -; CHECK-NEXT: bit.16b v0, v2, v4 -; CHECK-NEXT: bit.16b v1, v3, v4 +; CHECK-NEXT: mov x8, #9223372036854775807 +; CHECK-NEXT: dup.2d v4, x8 +; CHECK-NEXT: bif.16b v0, v2, v4 +; CHECK-NEXT: bif.16b v1, v3, v4 ; CHECK-NEXT: ret %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) ret <4 x double> %r @@ -191,7 +191,7 @@ ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0 ; NOFP16-NEXT: mov h3, v1[1] ; NOFP16-NEXT: mov h4, v0[1] -; NOFP16-NEXT: movi.4s v2, #128, lsl #24 +; NOFP16-NEXT: mvni.4s v2, #128, lsl #24 ; NOFP16-NEXT: fcvt s5, h1 ; NOFP16-NEXT: fcvt s6, h0 ; NOFP16-NEXT: mov h7, v1[2] @@ -199,29 +199,29 @@ ; NOFP16-NEXT: fcvt s3, h3 ; NOFP16-NEXT: fcvt s4, h4 ; NOFP16-NEXT: mov h1, v1[3] -; NOFP16-NEXT: bit.16b v6, v5, v2 -; NOFP16-NEXT: fcvt s5, h7 +; NOFP16-NEXT: bit.16b v5, v6, v2 +; NOFP16-NEXT: fcvt s6, h7 ; NOFP16-NEXT: fcvt s7, h16 -; NOFP16-NEXT: bit.16b v4, v3, v2 -; NOFP16-NEXT: mov h3, v0[3] -; NOFP16-NEXT: fcvt h0, s6 +; NOFP16-NEXT: bit.16b v3, v4, v2 +; NOFP16-NEXT: mov h4, v0[3] +; NOFP16-NEXT: fcvt h0, s5 ; NOFP16-NEXT: fcvt s1, h1 -; NOFP16-NEXT: bit.16b v7, v5, v2 -; NOFP16-NEXT: fcvt h4, s4 -; NOFP16-NEXT: fcvt s3, h3 -; NOFP16-NEXT: fcvt h5, s7 
-; NOFP16-NEXT: mov.h v0[1], v4[0] -; NOFP16-NEXT: bit.16b v3, v1, v2 +; NOFP16-NEXT: bit.16b v6, v7, v2 +; NOFP16-NEXT: fcvt h3, s3 +; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: fcvt h5, s6 +; NOFP16-NEXT: mov.h v0[1], v3[0] +; NOFP16-NEXT: bit.16b v1, v4, v2 ; NOFP16-NEXT: mov.h v0[2], v5[0] -; NOFP16-NEXT: fcvt h1, s3 +; NOFP16-NEXT: fcvt h1, s1 ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; NOFP16-NEXT: ret ; ; FP16-LABEL: test_copysign_v4f16_v4f16: ; FP16: ; %bb.0: -; FP16-NEXT: movi.4h v2, #128, lsl #8 -; FP16-NEXT: bit.8b v0, v1, v2 +; FP16-NEXT: mvni.4h v2, #128, lsl #8 +; FP16-NEXT: bif.8b v0, v1, v2 ; FP16-NEXT: ret %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %r @@ -233,7 +233,7 @@ ; NOFP16-NEXT: fcvtn v1.4h, v1.4s ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0 ; NOFP16-NEXT: mov h3, v0[1] -; NOFP16-NEXT: movi.4s v2, #128, lsl #24 +; NOFP16-NEXT: mvni.4s v2, #128, lsl #24 ; NOFP16-NEXT: fcvt s5, h0 ; NOFP16-NEXT: mov h7, v0[2] ; NOFP16-NEXT: mov h4, v1[1] @@ -242,30 +242,30 @@ ; NOFP16-NEXT: fcvt s3, h3 ; NOFP16-NEXT: mov h1, v1[3] ; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: bit.16b v5, v6, v2 +; NOFP16-NEXT: bif.16b v5, v6, v2 ; NOFP16-NEXT: fcvt s6, h7 ; NOFP16-NEXT: fcvt s7, h16 ; NOFP16-NEXT: fcvt s1, h1 -; NOFP16-NEXT: bit.16b v3, v4, v2 +; NOFP16-NEXT: bif.16b v3, v4, v2 ; NOFP16-NEXT: mov h4, v0[3] ; NOFP16-NEXT: fcvt h0, s5 -; NOFP16-NEXT: bit.16b v6, v7, v2 +; NOFP16-NEXT: bif.16b v6, v7, v2 ; NOFP16-NEXT: fcvt h3, s3 ; NOFP16-NEXT: fcvt s4, h4 ; NOFP16-NEXT: fcvt h5, s6 ; NOFP16-NEXT: mov.h v0[1], v3[0] -; NOFP16-NEXT: bit.16b v4, v1, v2 +; NOFP16-NEXT: bit.16b v1, v4, v2 ; NOFP16-NEXT: mov.h v0[2], v5[0] -; NOFP16-NEXT: fcvt h1, s4 +; NOFP16-NEXT: fcvt h1, s1 ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; NOFP16-NEXT: ret ; ; FP16-LABEL: test_copysign_v4f16_v4f32: ; FP16: ; %bb.0: -; FP16-NEXT: movi.4h v2, #128, lsl #8 +; FP16-NEXT: mvni.4h v2, #128, lsl #8 ; FP16-NEXT: fcvtn v1.4h, v1.4s -; FP16-NEXT: bit.8b v0, v1, v2 +; FP16-NEXT: bif.8b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x float> %b to <4 x half> %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0) @@ -278,28 +278,29 @@ ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0 ; NOFP16-NEXT: mov d4, v1[1] ; NOFP16-NEXT: mov h5, v0[1] -; NOFP16-NEXT: movi.4s v3, #128, lsl #24 +; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: fcvt s1, d1 ; NOFP16-NEXT: fcvt s6, h0 ; NOFP16-NEXT: mov h7, v0[2] ; NOFP16-NEXT: fcvt s4, d4 ; NOFP16-NEXT: fcvt s5, h5 -; NOFP16-NEXT: bit.16b v6, v1, v3 -; NOFP16-NEXT: fcvt s1, d2 +; NOFP16-NEXT: bit.16b v1, v6, v3 +; NOFP16-NEXT: fcvt s6, d2 ; NOFP16-NEXT: fcvt s7, h7 -; NOFP16-NEXT: bit.16b v5, v4, v3 +; NOFP16-NEXT: bit.16b v4, v5, v3 ; NOFP16-NEXT: mov d2, v2[1] -; NOFP16-NEXT: mov h4, v0[3] -; NOFP16-NEXT: fcvt h0, s6 -; NOFP16-NEXT: bit.16b v7, v1, v3 -; NOFP16-NEXT: fcvt h1, s5 +; NOFP16-NEXT: mov h5, v0[3] +; NOFP16-NEXT: fcvt h0, s1 +; NOFP16-NEXT: bit.16b v6, v7, v3 +; NOFP16-NEXT: fcvt h1, s4 ; NOFP16-NEXT: fcvt s2, d2 -; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: fcvt h5, s7 +; NOFP16-NEXT: fcvt s4, h5 +; NOFP16-NEXT: fcvt h5, s6 ; NOFP16-NEXT: mov.h v0[1], v1[0] -; NOFP16-NEXT: bit.16b v4, v2, v3 +; NOFP16-NEXT: mov.16b v1, v3 ; NOFP16-NEXT: mov.h v0[2], v5[0] -; NOFP16-NEXT: fcvt h1, s4 +; NOFP16-NEXT: bsl.16b v1, v4, v2 +; NOFP16-NEXT: fcvt h1, s1 ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ; kill: def $d0 killed 
$d0 killed $q0 ; NOFP16-NEXT: ret @@ -308,7 +309,7 @@ ; FP16: ; %bb.0: ; FP16-NEXT: mov d4, v1[1] ; FP16-NEXT: fcvt h1, d1 -; FP16-NEXT: movi.4h v3, #128, lsl #8 +; FP16-NEXT: mvni.4h v3, #128, lsl #8 ; FP16-NEXT: fcvt h4, d4 ; FP16-NEXT: mov.h v1[1], v4[0] ; FP16-NEXT: fcvt h4, d2 @@ -316,7 +317,7 @@ ; FP16-NEXT: mov.h v1[2], v4[0] ; FP16-NEXT: fcvt h2, d2 ; FP16-NEXT: mov.h v1[3], v2[0] -; FP16-NEXT: bit.8b v0, v1, v3 +; FP16-NEXT: bif.8b v0, v1, v3 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x half> %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0) @@ -332,7 +333,7 @@ ; NOFP16: ; %bb.0: ; NOFP16-NEXT: mov h5, v1[1] ; NOFP16-NEXT: mov h6, v0[1] -; NOFP16-NEXT: movi.4s v3, #128, lsl #24 +; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: fcvt s2, h1 ; NOFP16-NEXT: fcvt s4, h0 ; NOFP16-NEXT: mov h7, v1[2] @@ -340,47 +341,48 @@ ; NOFP16-NEXT: fcvt s5, h5 ; NOFP16-NEXT: fcvt s6, h6 ; NOFP16-NEXT: mov h17, v0[3] -; NOFP16-NEXT: bit.16b v4, v2, v3 -; NOFP16-NEXT: mov h2, v1[3] +; NOFP16-NEXT: mov h18, v0[5] +; NOFP16-NEXT: bit.16b v2, v4, v3 +; NOFP16-NEXT: mov h4, v1[3] ; NOFP16-NEXT: fcvt s7, h7 ; NOFP16-NEXT: fcvt s16, h16 -; NOFP16-NEXT: bit.16b v6, v5, v3 +; NOFP16-NEXT: bit.16b v5, v6, v3 ; NOFP16-NEXT: fcvt s17, h17 -; NOFP16-NEXT: fcvt s18, h2 -; NOFP16-NEXT: mov h5, v1[4] -; NOFP16-NEXT: fcvt h2, s4 -; NOFP16-NEXT: bit.16b v16, v7, v3 -; NOFP16-NEXT: mov h7, v0[4] -; NOFP16-NEXT: fcvt h4, s6 -; NOFP16-NEXT: bit.16b v17, v18, v3 -; NOFP16-NEXT: mov h6, v1[5] -; NOFP16-NEXT: mov h18, v0[5] -; NOFP16-NEXT: fcvt s5, h5 +; NOFP16-NEXT: mov.16b v6, v3 +; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: fcvt h2, s2 +; NOFP16-NEXT: fcvt h5, s5 +; NOFP16-NEXT: bsl.16b v6, v16, v7 +; NOFP16-NEXT: mov h7, v1[4] +; NOFP16-NEXT: mov h16, v0[4] +; NOFP16-NEXT: bit.16b v4, v17, v3 +; NOFP16-NEXT: mov h17, v1[5] +; NOFP16-NEXT: mov.h v2[1], v5[0] ; NOFP16-NEXT: fcvt s7, h7 -; NOFP16-NEXT: mov.h v2[1], v4[0] -; NOFP16-NEXT: fcvt h4, s16 -; NOFP16-NEXT: fcvt s6, h6 -; NOFP16-NEXT: fcvt s16, h18 -; NOFP16-NEXT: fcvt h17, s17 -; NOFP16-NEXT: bit.16b v7, v5, v3 -; NOFP16-NEXT: mov h5, v0[6] -; NOFP16-NEXT: mov.h v2[2], v4[0] -; NOFP16-NEXT: mov h4, v1[6] -; NOFP16-NEXT: bit.16b v16, v6, v3 +; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: fcvt h5, s6 +; NOFP16-NEXT: fcvt s6, h17 +; NOFP16-NEXT: fcvt s17, h18 +; NOFP16-NEXT: fcvt h4, s4 +; NOFP16-NEXT: bit.16b v7, v16, v3 +; NOFP16-NEXT: mov h16, v0[6] +; NOFP16-NEXT: mov.h v2[2], v5[0] +; NOFP16-NEXT: mov h5, v1[6] +; NOFP16-NEXT: bit.16b v6, v17, v3 ; NOFP16-NEXT: mov h1, v1[7] +; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: mov.h v2[3], v4[0] +; NOFP16-NEXT: fcvt h4, s7 ; NOFP16-NEXT: fcvt s5, h5 -; NOFP16-NEXT: mov.h v2[3], v17[0] -; NOFP16-NEXT: fcvt h6, s7 -; NOFP16-NEXT: fcvt s4, h4 ; NOFP16-NEXT: mov h0, v0[7] ; NOFP16-NEXT: fcvt s1, h1 -; NOFP16-NEXT: mov.h v2[4], v6[0] -; NOFP16-NEXT: bit.16b v5, v4, v3 -; NOFP16-NEXT: fcvt h4, s16 +; NOFP16-NEXT: mov.h v2[4], v4[0] +; NOFP16-NEXT: fcvt h4, s6 +; NOFP16-NEXT: bit.16b v5, v16, v3 ; NOFP16-NEXT: fcvt s0, h0 -; NOFP16-NEXT: fcvt h5, s5 ; NOFP16-NEXT: mov.h v2[5], v4[0] -; NOFP16-NEXT: bit.16b v0, v1, v3 +; NOFP16-NEXT: fcvt h5, s5 +; NOFP16-NEXT: bif.16b v0, v1, v3 ; NOFP16-NEXT: mov.h v2[6], v5[0] ; NOFP16-NEXT: fcvt h0, s0 ; NOFP16-NEXT: mov.h v2[7], v0[0] @@ -389,8 +391,8 @@ ; ; FP16-LABEL: test_copysign_v8f16_v8f16: ; FP16: ; %bb.0: -; FP16-NEXT: movi.8h v2, #128, lsl #8 -; FP16-NEXT: bit.16b v0, v1, v2 +; FP16-NEXT: mvni.8h v2, #128, lsl #8 +; FP16-NEXT: bif.16b v0, 
v1, v2 ; FP16-NEXT: ret %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) ret <8 x half> %r @@ -401,7 +403,7 @@ ; NOFP16: ; %bb.0: ; NOFP16-NEXT: fcvtn v1.4h, v1.4s ; NOFP16-NEXT: fcvtn v2.4h, v2.4s -; NOFP16-NEXT: movi.4s v3, #128, lsl #24 +; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: mov h4, v0[1] ; NOFP16-NEXT: mov h5, v0[4] ; NOFP16-NEXT: fcvt s7, h0 @@ -413,42 +415,43 @@ ; NOFP16-NEXT: fcvt s5, h5 ; NOFP16-NEXT: fcvt s17, h17 ; NOFP16-NEXT: fcvt s6, h6 -; NOFP16-NEXT: bit.16b v7, v16, v3 +; NOFP16-NEXT: bif.16b v7, v16, v3 ; NOFP16-NEXT: fcvt s16, h2 ; NOFP16-NEXT: fcvt s18, h18 -; NOFP16-NEXT: bit.16b v4, v6, v3 +; NOFP16-NEXT: bif.16b v4, v6, v3 ; NOFP16-NEXT: mov h6, v0[3] -; NOFP16-NEXT: bit.16b v5, v16, v3 +; NOFP16-NEXT: bif.16b v5, v16, v3 ; NOFP16-NEXT: mov h16, v1[3] ; NOFP16-NEXT: fcvt h1, s7 -; NOFP16-NEXT: mov h7, v0[5] -; NOFP16-NEXT: bit.16b v17, v18, v3 +; NOFP16-NEXT: mov.16b v7, v3 ; NOFP16-NEXT: fcvt h4, s4 ; NOFP16-NEXT: fcvt s6, h6 ; NOFP16-NEXT: fcvt s16, h16 -; NOFP16-NEXT: mov h18, v2[1] -; NOFP16-NEXT: fcvt s7, h7 ; NOFP16-NEXT: fcvt h5, s5 +; NOFP16-NEXT: bsl.16b v7, v17, v18 +; NOFP16-NEXT: mov h17, v0[5] +; NOFP16-NEXT: mov h18, v2[1] ; NOFP16-NEXT: mov.h v1[1], v4[0] -; NOFP16-NEXT: fcvt h4, s17 -; NOFP16-NEXT: bit.16b v6, v16, v3 +; NOFP16-NEXT: bif.16b v6, v16, v3 +; NOFP16-NEXT: fcvt h4, s7 +; NOFP16-NEXT: fcvt s7, h17 ; NOFP16-NEXT: fcvt s17, h18 ; NOFP16-NEXT: mov h16, v2[2] +; NOFP16-NEXT: mov h2, v2[3] +; NOFP16-NEXT: fcvt h6, s6 ; NOFP16-NEXT: mov.h v1[2], v4[0] ; NOFP16-NEXT: mov h4, v0[6] -; NOFP16-NEXT: mov h0, v0[7] -; NOFP16-NEXT: fcvt h6, s6 -; NOFP16-NEXT: mov h2, v2[3] -; NOFP16-NEXT: bit.16b v7, v17, v3 +; NOFP16-NEXT: bif.16b v7, v17, v3 ; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: mov h0, v0[7] +; NOFP16-NEXT: fcvt s2, h2 ; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: fcvt s0, h0 ; NOFP16-NEXT: mov.h v1[3], v6[0] -; NOFP16-NEXT: fcvt s2, h2 -; NOFP16-NEXT: bit.16b v4, v16, v3 +; NOFP16-NEXT: fcvt s0, h0 +; NOFP16-NEXT: bif.16b v4, v16, v3 ; NOFP16-NEXT: mov.h v1[4], v5[0] ; NOFP16-NEXT: fcvt h5, s7 -; NOFP16-NEXT: bit.16b v0, v2, v3 +; NOFP16-NEXT: bif.16b v0, v2, v3 ; NOFP16-NEXT: fcvt h4, s4 ; NOFP16-NEXT: mov.h v1[5], v5[0] ; NOFP16-NEXT: fcvt h0, s0 @@ -461,9 +464,9 @@ ; FP16: ; %bb.0: ; FP16-NEXT: fcvtn v2.4h, v2.4s ; FP16-NEXT: fcvtn v1.4h, v1.4s -; FP16-NEXT: movi.8h v3, #128, lsl #8 +; FP16-NEXT: mvni.8h v3, #128, lsl #8 ; FP16-NEXT: mov.d v1[1], v2[0] -; FP16-NEXT: bit.16b v0, v1, v3 +; FP16-NEXT: bif.16b v0, v1, v3 ; FP16-NEXT: ret %tmp0 = fptrunc <8 x float> %b to <8 x half> %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)
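
Editorial note (not part of the patch): FCOPYSIGN is now lowered to AArch64ISD::BSP,
whose semantics are those of NEON/SVE2 BSL: result = (in1 & mask) | (in2 & ~mask).
The lowering passes ~SignMask as the mask, so the magnitude comes from the first
operand and only the sign bit from the second; that is why the checks above switch
from movi/bit to mvni/bif, why plain SVE expands BSP back into the and/and/orr
sequence via performBSPExpandForSVE, and why SVE_3_Op_BSP_Pat reorders BSP's
(mask, in1, in2) operands into BSL's (in1, in2, mask). The following is a minimal
standalone sketch of that identity, assuming C++20 std::bit_cast; the helper names
bsl64 and copysign_via_bsp are illustrative, not part of the patch.

#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

// Scalar model of one 64-bit lane of "bsl mask, in1, in2":
// take bits from in1 where mask is 1, from in2 where mask is 0.
static uint64_t bsl64(uint64_t mask, uint64_t in1, uint64_t in2) {
  return (in1 & mask) | (in2 & ~mask);
}

// copysign(mag, sgn) with mask = ~sign-bit: magnitude bits from mag,
// sign bit from sgn. The SVE fallback computes the same value with
// AND/AND(NOT)/OR, matching the orr z0.d, z0.d, z1.d checks above.
static double copysign_via_bsp(double mag, double sgn) {
  constexpr uint64_t SignMask = 1ULL << 63;
  uint64_t r = bsl64(~SignMask, std::bit_cast<uint64_t>(mag),
                     std::bit_cast<uint64_t>(sgn));
  return std::bit_cast<double>(r);
}

int main() {
  assert(copysign_via_bsp(1.5, -0.0) == std::copysign(1.5, -0.0));
  assert(copysign_via_bsp(-2.25, 3.0) == std::copysign(-2.25, 3.0));
  return 0;
}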