diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -429,6 +429,57 @@
   default:
     break;
 
+  case AArch64::BSPv8i8:
+  case AArch64::BSPv16i8: {
+    Register DstReg = MI.getOperand(0).getReg();
+    if (DstReg == MI.getOperand(3).getReg()) {
+      // Expand to BIT
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+                                                  : AArch64::BITv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(1));
+    } else if (DstReg == MI.getOperand(2).getReg()) {
+      // Expand to BIF
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+                                                  : AArch64::BIFv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(1));
+    } else {
+      // Expand to BSL, use additional move if required
+      if (DstReg == MI.getOperand(1).getReg()) {
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+                                                    : AArch64::BSLv16i8))
+            .add(MI.getOperand(0))
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(2))
+            .add(MI.getOperand(3));
+      } else {
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
+                                                    : AArch64::ORRv16i8))
+            .addReg(DstReg)
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(1));
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+                                                    : AArch64::BSLv16i8))
+            .add(MI.getOperand(0))
+            .addReg(DstReg)
+            .add(MI.getOperand(2))
+            .add(MI.getOperand(3));
+      }
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+
   case AArch64::ADDWrr:
   case AArch64::SUBWrr:
   case AArch64::ADDXrr:
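Aside for readers of the expansion above: after register allocation the BSP pseudo is rewritten into whichever of BIT/BIF/BSL already has its tied input in the destination register, falling back to an ORR copy of the mask plus BSL when the destination matches none of the three sources. A minimal stand-alone sketch of that decision table follows; it is illustrative only (the Op enum, register-id parameters and helper name are invented for this note, not part of the patch):

#include <cassert>

// bsp dst, a, b, c computes dst = (a & b) | (~a & c), with no tied operand.
enum class Op { BIT, BIF, BSL, OrrThenBsl };

// dst, a (mask), b ("true" input), c ("false" input) are register ids.
Op chooseExpansion(unsigned dst, unsigned a, unsigned b, unsigned c) {
  if (dst == c)          // dst already holds the "false" input:
    return Op::BIT;      //   bit dst, b, a  -> dst = (dst & ~a) | (b & a)
  if (dst == b)          // dst already holds the "true" input:
    return Op::BIF;      //   bif dst, c, a  -> dst = (dst & a) | (c & ~a)
  if (dst == a)          // dst already holds the mask:
    return Op::BSL;      //   bsl dst, b, c  -> dst = (dst & b) | (~dst & c)
  return Op::OrrThenBsl; // copy mask first (orr dst, a, a), then bsl dst, b, c
}

int main() {
  assert(chooseExpansion(3, 1, 2, 3) == Op::BIT);
  assert(chooseExpansion(2, 1, 2, 3) == Op::BIF);
  assert(chooseExpansion(1, 1, 2, 3) == Op::BSL);
  assert(chooseExpansion(0, 1, 2, 3) == Op::OrrThenBsl);
  return 0;
}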
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -90,9 +90,9 @@
   BICi,
   ORRi,
 
-  // Vector bit select: similar to ISD::VSELECT but not all bits within an
+  // Vector bitwise select: similar to ISD::VSELECT but not all bits within an
   // element must be identical.
-  BSL,
+  BSP,
 
   // Vector arithmetic negation
   NEG,
@@ -166,7 +166,7 @@
   // Vector bitwise negation
   NOT,
 
-  // Vector bitwise selection
+  // Vector bitwise insertion
   BIT,
 
   // Compare-and-branch
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1287,7 +1287,7 @@
   case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
   case AArch64ISD::BICi: return "AArch64ISD::BICi";
   case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
-  case AArch64ISD::BSL: return "AArch64ISD::BSL";
+  case AArch64ISD::BSP: return "AArch64ISD::BSP";
   case AArch64ISD::NEG: return "AArch64ISD::NEG";
   case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
   case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
@@ -10229,7 +10229,7 @@
     }
 
     if (FoundMatch)
-      return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+      return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
                          N0->getOperand(1 - i), N1->getOperand(1 - j));
   }
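For context (reviewer note, not part of the patch): the renamed AArch64ISD::BSP node keeps the bitwise-select semantics of the old BSL node; it only stops promising which source operand the destination will be tied to. A scalar model of the select that the DAG combine above now produces, written as a small self-contained check:

#include <cassert>
#include <cstdint>

// bsp(mask, t, f): per bit, take the bit of t where mask is 1, of f where mask is 0.
uint64_t bsp(uint64_t mask, uint64_t t, uint64_t f) {
  return (mask & t) | (~mask & f);
}

int main() {
  uint64_t m = 0x00ff00ff00ff00ffULL, t = ~0ULL, f = 0;
  assert(bsp(m, t, f) == m);                       // select picks t exactly under the mask
  assert(((t & m) | (f & ~m)) == bsp(m, t, f));    // the or/and/not shape matched by the combine
  return 0;
}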
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5207,6 +5207,47 @@
   let Inst{4-0} = Rd;
 }
 
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
+  : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
+    Sched<[WriteV]>;
+
+multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorPseudo<V64, [(set (v8i8 V64:$dst),
+                             (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorPseudo<V128, [(set (v16i8 V128:$dst),
+                             (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+                           (v4i16 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+                           (v2i32 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+                           (v1i64 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+                           (v8i16 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+                           (v4i32 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+                           (v2i64 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
 // All operand sizes distinguished in the encoding.
 multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
@@ -5427,7 +5468,7 @@
 }
 
 multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
-                                      string asm, SDPatternOperator OpNode> {
+                                      string asm, SDPatternOperator OpNode = null_frag> {
   def v8i8  : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
                                           asm, ".8b",
     [(set (v8i8 V64:$dst),
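One more illustrative aside (mine, not from the patch): a single untied BSP pseudo can legally become any of BSL, BIT or BIF after register allocation because the three instructions compute the same select and differ only in which input the tied destination register supplies. A quick self-contained check of that equivalence, using plain scalar stand-ins for the vector registers:

#include <cassert>
#include <cstdint>

// Each function overwrites its first argument, mirroring the tied operand.
uint64_t bsl(uint64_t d /*mask*/, uint64_t n, uint64_t m) { return (d & n) | (~d & m); }
uint64_t bit(uint64_t d, uint64_t n, uint64_t m /*mask*/) { return (d & ~m) | (n & m); }
uint64_t bif(uint64_t d, uint64_t n, uint64_t m /*mask*/) { return (d & m) | (n & ~m); }

int main() {
  uint64_t mask = 0x0f0f0f0f0f0f0f0fULL;
  uint64_t a = 0x1122334455667788ULL, b = 0x99aabbccddeeff00ULL;
  uint64_t expected = (mask & a) | (~mask & b);  // bsp(mask, a, b)
  assert(bsl(mask, a, b) == expected);  // destination held the mask
  assert(bit(b, a, mask) == expected);  // destination held the "false" input
  assert(bif(a, b, mask) == expected);  // destination held the "true" input
  return 0;
}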
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -468,7 +468,7 @@
 def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
 
 def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
-def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>;
 
 def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
 def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
@@ -3955,33 +3955,53 @@
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
-defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
-defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
-    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
 defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
 defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
                                   BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
 defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-
-def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-
-def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+// Pseudo bitwise select pattern BSP.
+// It is expanded into BSL/BIT/BIF after register allocation.
+defm BSP : SIMDLogicalThreeVectorPseudo<TriOpFrag<(or (and node:$LHS, node:$MHS),
+                                                      (and (vnot node:$LHS), node:$RHS))>>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">;
+
+def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(AArch64bit (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bit (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
 
 def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
                 (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -501,7 +501,7 @@
 //   Q form - v16i8, v8i16, v4i32, v2i64
 
 // ASIMD bitwise insert, Q-form
-def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>;
 
 // ASIMD duplicate, gen reg, D-form and Q-form
 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
--- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -494,7 +494,7 @@
 // WriteV includes:
 // SHLL,SSHLL,USHLL
 // SLI,SRI
-// BIF,BIT,BSL
+// BIF,BIT,BSL,BSP
 // EXT
 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
 // XTN2
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -660,7 +660,7 @@
 // ASIMD miscellaneous instructions.
def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -803,7 +803,7 @@ // ASIMD miscellaneous instructions. def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>; def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -841,7 +841,7 @@ // ASIMD miscellaneous instructions. def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; -def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td --- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; @@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td --- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -462,13 +462,13 @@ let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], - (instrs BIFv8i8, BITv8i8, BSLv8i8)>; + (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>; def KryoWrite_1cyc_X_X_75ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> { let Latency = 1; let NumMicroOps = 2; } def : 
InstRW<[KryoWrite_1cyc_X_X_75ln], - (instrs BIFv16i8, BITv16i8, BSLv16i8)>; + (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>; def KryoWrite_0cyc_noRSV_11ln : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1482,7 +1482,7 @@ // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^BIFv", "^BITv", "^BSLv")>; + (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>; // ASIMD count, D-form // ASIMD count, Q-form diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIF Bitwise Insert if False +; +; 8-bit vectors tests + +define <1 x i8> @test_bitf_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bitf_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i8> %C, + %and = and <1 x i8> %neg, %B + %and1 = and <1 x i8> %C, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bitf_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bitf_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i16> %C, + %and = and <1 x i16> %neg, %B + %and1 = and <1 x i16> %C, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bitf_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i32> %C, + %and = and <1 x i32> %neg, %B + %and1 = and <1 x i32> %C, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bitf_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bitf_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i64> %C, + %and = and <1 x i64> %neg, %B + %and1 = and <1 x i64> %C, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bitf_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bitf_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <2 x i32> %C, + %and = and <2 x i32> %neg, %B + %and1 = and <2 x i32> %C, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bitf_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bitf_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <4 x i16> %C, + %and = and <4 x i16> %neg, %B + %and1 = and <4 x i16> %C, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bitf_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bitf_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <8 x i8> %C, + %and = and <8 x i8> %neg, %B + %and1 = and <8 x i8> %C, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + 
+; 128-bit vectors tests + +define <2 x i64> @test_bitf_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bitf_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <2 x i64> %C, + %and = and <2 x i64> %neg, %B + %and1 = and <2 x i64> %C, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bitf_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bitf_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <4 x i32> %C, + %and = and <4 x i32> %neg, %B + %and1 = and <4 x i32> %C, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bitf_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bitf_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <8 x i16> %C, + %and = and <8 x i16> %neg, %B + %and1 = and <8 x i16> %C, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bitf_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bitf_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <16 x i8> %C, + %and = and <16 x i8> %neg, %B + %and1 = and <16 x i8> %C, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIT Bitwise Insert if True +; +; 8-bit vectors tests + +define <1 x i8> @test_bit_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bit_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i8> %C, %B + %neg = xor <1 x i8> %C, + %and1 = and <1 x i8> %neg, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bit_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bit_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i16> %C, %B + %neg = xor <1 x i16> %C, + %and1 = and <1 x i16> %neg, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bit_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i32> %C, %B + %neg = xor <1 x i32> %C, + %and1 = and <1 x i32> %neg, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bit_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bit_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i64> %C, %B + %neg = xor <1 x i64> %C, + %and1 = and <1 x i64> %neg, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bit_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bit_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <2 x i32> %C, %B + %neg = xor <2 x i32> %C, + %and1 = and <2 x i32> %neg, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + 
+define <4 x i16> @test_bit_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bit_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <4 x i16> %C, %B + %neg = xor <4 x i16> %C, + %and1 = and <4 x i16> %neg, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bit_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bit_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <8 x i8> %C, %B + %neg = xor <8 x i8> %C, + %and1 = and <8 x i8> %neg, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bit_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bit_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <2 x i64> %C, %B + %neg = xor <2 x i64> %C, + %and1 = and <2 x i64> %neg, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bit_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bit_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <4 x i32> %C, %B + %neg = xor <4 x i32> %C, + %and1 = and <4 x i32> %neg, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bit_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bit_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <8 x i16> %C, %B + %neg = xor <8 x i16> %C, + %and1 = and <8 x i16> %neg, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bit_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <16 x i8> %C, %B + %neg = xor <16 x i8> %C, + %and1 = and <16 x i8> %neg, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll @@ -9,8 +9,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8b, v3.8b, v2.8b ; CHECK-NEXT: dup v2.8b, v2.b[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d @@ -49,8 +48,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.16b, v3.16b, v2.16b ; CHECK-NEXT: dup v2.16b, v2.b[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d @@ -92,8 +90,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4h, v3.4h, v2.4h ; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d @@ -107,8 +104,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8h, v3.8h, v2.8h ; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <8x i16> 
%c, <8x i16> %d @@ -122,8 +118,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.2s, v3.2s, v2.2s ; CHECK-NEXT: dup v2.2s, v2.s[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d @@ -137,8 +132,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d @@ -151,8 +145,7 @@ ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d @@ -166,8 +159,7 @@ ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq v2.2d, v3.2d, v2.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d @@ -222,8 +214,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x float> %c, <4x float> %d @@ -247,8 +238,7 @@ ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d @@ -278,8 +268,7 @@ ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp = icmp ne i1 %cc, 0 %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b @@ -294,8 +283,7 @@ ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: fcmeq v2.4s, v2.4s, v3.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq float %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b @@ -309,8 +297,7 @@ ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 ; CHECK-NEXT: fcmeq v2.2d, v2.2d, v3.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq double %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -5,8 +5,7 @@ define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 { ; CHECK-LABEL: select_64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: %0 = bitcast <4 x half> %a to <4 x i16> @@ -23,8 +22,7 @@ 
define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 { ; CHECK-LABEL: select_128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: %0 = bitcast <8 x half> %a to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -61,8 +61,7 @@ ; CHECK-LABEL: bsl8xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 > %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 > @@ -74,8 +73,7 @@ ; CHECK-LABEL: bsl16xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 > %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -664,8 +662,7 @@ ; CHECK-LABEL: bsl2xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x000000ffffffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 > %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 > @@ -678,8 +675,7 @@ ; CHECK-LABEL: bsl4xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 > %tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 > @@ -691,8 +687,7 @@ ; CHECK-LABEL: bsl1xi64_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0xffffffffffffff00 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <1 x i64> %a, < i64 -256 > %tmp2 = and <1 x i64> %b, < i64 255 > @@ -704,8 +699,7 @@ ; CHECK-LABEL: bsl4xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 > %tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 > @@ -717,8 +711,7 @@ ; CHECK-LABEL: bsl8xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 > %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 > @@ -731,8 +724,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI75_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI75_0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <2 x i64> %a, 
< i64 -1, i64 0 > %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 > diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -480,9 +480,9 @@ ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: mov w9, #42 ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v0.2d, x9 -; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v1.2d, x9 +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %c = icmp ult <2 x i64> %x, %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> @@ -653,8 +653,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v2.16b, v1.16b ; CHECK-NEXT: cmhi v3.2d, v2.2d, v0.2d -; CHECK-NEXT: bsl v3.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.2d, v3.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %noty = xor <2 x i64> %y, %c = icmp ult <2 x i64> %x, %noty diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -71,10 +71,9 @@ ; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s ; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v2.2s -; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 -; CHECK-NEXT: bsl v1.8b, v0.8b, v2.8b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s +; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) ret <2 x float> %1 @@ -95,10 +94,9 @@ ; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s ; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v2.4s -; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) ret <4 x float> %1 @@ -120,21 +118,19 @@ ; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.4s, v1.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.4s, v1.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v1.4s, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) ret <8 x float> %1 @@ -210,10 +206,9 @@ ; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d ; 
CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v2.2d -; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d +; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) ret <2 x double> %1 @@ -238,24 +233,22 @@ ; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v3.2d -; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d ; CHECK-NEXT: fcmeq v3.2d, v1.2d, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) ret <4 x double> %1 diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll @@ -62,8 +62,7 @@ ; CHECK-LABEL: out_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, %x @@ -76,8 +75,7 @@ ; CHECK-LABEL: in_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, ; %x %n1 = and <4 x i32> %n0, %mask @@ -90,8 +88,7 @@ ; CHECK-LABEL: out_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, %x @@ -105,8 +102,7 @@ ; CHECK-LABEL: in_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> %x, ; %x @@ -169,9 +165,8 @@ define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov 
v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, @@ -183,9 +178,8 @@ define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> , %y ; %x %n1 = and <4 x i32> %n0, %mask @@ -197,9 +191,8 @@ define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, @@ -212,9 +205,8 @@ define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> , %y ; %x diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll @@ -13,8 +13,7 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i8> %x, %mask %notmask = xor <1 x i8> %mask, @@ -46,8 +45,7 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i16> %x, %mask %notmask = xor <1 x i16> %mask, @@ -111,8 +109,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -128,8 +125,7 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: out_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <8 x i8> %x, %mask %notmask = xor <8 x i8> %mask, @@ -141,8 +137,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -154,8 +149,7 @@ define <4 x 
i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -167,8 +161,7 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: out_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, @@ -180,8 +173,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -197,8 +189,7 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: out_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <16 x i8> %x, %mask %notmask = xor <16 x i8> %mask, @@ -210,8 +201,7 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: out_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <8 x i16> %x, %mask %notmask = xor <8 x i16> %mask, @@ -223,8 +213,7 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -236,8 +225,7 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -249,8 +237,7 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: out_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, @@ -270,8 +257,7 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: in_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i8> %x, %y %n1 = and <1 x i8> %n0, %mask @@ -286,8 +272,7 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-LABEL: in_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i8> %x, %y %n1 = and <2 x i8> %n0, %mask @@ -298,8 +283,7 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: in_v1i16: ; CHECK: // %bb.0: -; 
CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i16> %x, %y %n1 = and <1 x i16> %n0, %mask @@ -314,8 +298,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: in_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -326,8 +309,7 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: in_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -338,8 +320,7 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: in_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i32> %x, %y %n1 = and <1 x i32> %n0, %mask @@ -354,8 +335,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -366,8 +346,7 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: in_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -378,8 +357,7 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: in_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -390,8 +368,7 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: in_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i64> %x, %y %n1 = and <1 x i64> %n0, %mask @@ -406,8 +383,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -418,8 +394,7 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: in_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -430,8 +405,7 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: in_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, %y %n1 = and <4 x i32> %n0, %mask @@ -442,8 +416,7 
@@ define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: in_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i64> %x, %y %n1 = and <2 x i64> %n0, %mask diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -318,8 +318,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -351,8 +351,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -577,8 +577,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -606,8 +606,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -637,8 +637,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -670,8 +670,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -699,8 +699,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -729,8 +729,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b 
-; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -761,8 +761,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -104,8 +104,8 @@ ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI4_4] ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret
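Closing reviewer note (mine, not part of the patch): most of the test churn above is the disappearance of the extra `mov` that the old tied BSL forced, with the select now emitted directly as `bif`/`bit` on the destination register. The `in_*`/`out_*` masked-merge tests rely on the identity below, which is why both IR shapes end up as the same select instruction; a quick self-contained check of that identity:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x0123456789abcdefULL, y = 0xfedcba9876543210ULL,
           m = 0x00ff00ff00ff00ffULL;
  uint64_t out = (x & m) | (y & ~m);  // the "out_*" test pattern
  uint64_t in  = ((x ^ y) & m) ^ y;   // the "in_*" test pattern
  assert(in == out);                  // both are a bitwise select of x/y by m
  return 0;
}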