diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -429,6 +429,63 @@
   default:
     break;
 
+  // BSLP is a pseudo bitwise select whose mask is operand 1:
+  //   dst = (op1 & op2) | (~op1 & op3)
+  // It has no tied operand, so choose the real instruction according to which
+  // source register, if any, shares its register with dst.
+  case AArch64::BSLPv8i8:
+  case AArch64::BSLPv16i8: {
+    Register DstReg = MI.getOperand(0).getReg();
+    if (DstReg == MI.getOperand(3).getReg()) {
+      // Expand to BIT
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BITv8i8
+                                                   : AArch64::BITv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(1));
+    } else if (DstReg == MI.getOperand(2).getReg()) {
+      // Expand to BIF
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BIFv8i8
+                                                   : AArch64::BIFv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(1));
+    } else {
+      // Expand to BSL, use additional move if required
+      if (DstReg == MI.getOperand(1).getReg()) {
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BSLv8i8
+                                                     : AArch64::BSLv16i8))
+            .add(MI.getOperand(0))
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(2))
+            .add(MI.getOperand(3));
+      } else {
+        // Copy the mask into dst first (ORR Vd, Vn, Vn is a vector move).
+        // dst is written by this instruction, so add it as a definition.
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::ORRv8i8
+                                                     : AArch64::ORRv16i8))
+            .addReg(DstReg, RegState::Define)
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(1));
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BSLv8i8
+                                                     : AArch64::BSLv16i8))
+            .add(MI.getOperand(0))
+            .addReg(DstReg)
+            .add(MI.getOperand(2))
+            .add(MI.getOperand(3));
+      }
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+
   case AArch64::ADDWrr:
   case AArch64::SUBWrr:
   case AArch64::ADDXrr:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -92,7 +92,7 @@
 
   // Vector bit select: similar to ISD::VSELECT but not all bits within an
   // element must be identical.
-  BSL,
+  BSLP,
 
   // Vector arithmetic negation
   NEG,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1287,7 +1287,7 @@
   case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
   case AArch64ISD::BICi:              return "AArch64ISD::BICi";
   case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
-  case AArch64ISD::BSL:               return "AArch64ISD::BSL";
+  case AArch64ISD::BSLP:              return "AArch64ISD::BSLP";
   case AArch64ISD::NEG:               return "AArch64ISD::NEG";
   case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
   case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
@@ -10228,7 +10228,7 @@
       }
 
       if (FoundMatch)
-        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+        return DAG.getNode(AArch64ISD::BSLP, DL, VT, SDValue(BVN0, 0),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
     }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5207,6 +5207,51 @@
   let Inst{4-0}   = Rd;
 }
 
+// Three-source vector pseudo with no operand tied to the destination.
+// Declared as having no memory access and no side effects.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
+    : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
+      Sched<[WriteV]>;
+
+// Defines the v8i8 and v16i8 pseudos for OpNode and maps the remaining 64-bit
+// and 128-bit integer vector types onto them.
+multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorPseudo<V64,
+             [(set (v8i8 V64:$dst),
+                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8  : BaseSIMDThreeSameVectorPseudo<V128,
+             [(set (v16i8 V128:$dst),
+                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                           (v16i8 V128:$Rm)))]>;
+
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+                           (v4i16 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+                           (v2i32 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+                           (v1i64 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+                           (v8i16 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+                           (v4i32 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+                           (v2i64 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
 // All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector opc, string asm, SDPatternOperator OpNode> { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -468,7 +468,7 @@ def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; -def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; +def AArch64bslp: SDNode<"AArch64ISD::BSLP", SDT_AArch64trivec>; def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; @@ -3955,34 +3955,90 @@ defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), +// Pseudo bitwise bsl-like pattern BSLP. +// It is expanded into BSL/BIT/BIF after register allocation. 
+defm BSLP : SIMDLogicalThreeVectorPseudo>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", AArch64bslp>; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif", AArch64bslp>; + +def : Pat<(AArch64bslp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bslp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bslp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bslp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bslp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bslp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bslp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bslp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : Pat<(AArch64bslp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), +def : Pat<(AArch64bslp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), +def : Pat<(AArch64bslp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), +def : Pat<(AArch64bslp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), +def : Pat<(AArch64bslp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), +def : Pat<(AArch64bslp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : 
Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), +def : Pat<(AArch64bslp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), +def : Pat<(AArch64bslp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bit (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bit (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bit (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bit (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bit (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bit (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bit (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bit (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : Pat<(AArch64bslp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bslp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bslp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bslp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bslp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bslp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bslp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bslp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + def : 
InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIF Bitwise Insert if False +; +; 8-bit vectors tests + +define <1 x i8> @test_bitf_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bitf_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i8> %C, + %and = and <1 x i8> %neg, %B + %and1 = and <1 x i8> %C, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bitf_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bitf_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i16> %C, + %and = and <1 x i16> %neg, %B + %and1 = and <1 x i16> %C, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bitf_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i32> %C, + %and = and <1 x i32> %neg, %B + %and1 = and <1 x i32> %C, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bitf_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bitf_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i64> %C, + %and = and <1 x i64> %neg, %B + %and1 = and <1 x i64> %C, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + 
+define <2 x i32> @test_bitf_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bitf_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <2 x i32> %C, + %and = and <2 x i32> %neg, %B + %and1 = and <2 x i32> %C, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bitf_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bitf_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <4 x i16> %C, + %and = and <4 x i16> %neg, %B + %and1 = and <4 x i16> %C, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bitf_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bitf_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <8 x i8> %C, + %and = and <8 x i8> %neg, %B + %and1 = and <8 x i8> %C, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bitf_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bitf_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <2 x i64> %C, + %and = and <2 x i64> %neg, %B + %and1 = and <2 x i64> %C, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bitf_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bitf_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <4 x i32> %C, + %and = and <4 x i32> %neg, %B + %and1 = and <4 x i32> %C, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bitf_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bitf_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <8 x i16> %C, + %and = and <8 x i16> %neg, %B + %and1 = and <8 x i16> %C, %A + 
%or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bitf_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bitf_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <16 x i8> %C, + %and = and <16 x i8> %neg, %B + %and1 = and <16 x i8> %C, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIT Bitwise Insert if True +; +; 8-bit vectors tests + +define <1 x i8> @test_bit_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bit_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i8> %C, %B + %neg = xor <1 x i8> %C, + %and1 = and <1 x i8> %neg, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bit_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bit_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i16> %C, %B + %neg = xor <1 x i16> %C, + %and1 = and <1 x i16> %neg, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bit_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i32> %C, %B + %neg = xor <1 x i32> %C, + %and1 = and <1 x i32> %neg, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bit_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bit_v1i64: +; 
CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i64> %C, %B + %neg = xor <1 x i64> %C, + %and1 = and <1 x i64> %neg, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bit_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bit_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <2 x i32> %C, %B + %neg = xor <2 x i32> %C, + %and1 = and <2 x i32> %neg, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bit_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bit_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <4 x i16> %C, %B + %neg = xor <4 x i16> %C, + %and1 = and <4 x i16> %neg, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bit_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bit_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <8 x i8> %C, %B + %neg = xor <8 x i8> %C, + %and1 = and <8 x i8> %neg, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bit_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bit_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <2 x i64> %C, %B + %neg = xor <2 x i64> %C, + %and1 = and <2 x i64> %neg, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bit_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bit_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <4 x i32> %C, %B + %neg = xor <4 x i32> %C, + %and1 = and <4 x i32> %neg, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bit_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> 
%C) { +; CHECK-LABEL: test_bit_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <8 x i16> %C, %B + %neg = xor <8 x i16> %C, + %and1 = and <8 x i16> %neg, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bit_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <16 x i8> %C, %B + %neg = xor <16 x i8> %C, + %and1 = and <16 x i8> %neg, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll @@ -9,8 +9,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8b, v3.8b, v2.8b ; CHECK-NEXT: dup v2.8b, v2.b[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d @@ -49,8 +48,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.16b, v3.16b, v2.16b ; CHECK-NEXT: dup v2.16b, v2.b[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d @@ -92,8 +90,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4h, v3.4h, v2.4h ; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d @@ -107,8 +104,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8h, v3.8h, v2.8h ; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; 
CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d @@ -122,8 +118,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.2s, v3.2s, v2.2s ; CHECK-NEXT: dup v2.2s, v2.s[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d @@ -137,8 +132,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d @@ -151,8 +145,7 @@ ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d @@ -166,8 +159,7 @@ ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq v2.2d, v3.2d, v2.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d @@ -222,8 +214,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x float> %c, <4x float> %d @@ -247,8 +238,7 @@ ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = 
icmp eq i64 %a, %b %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d @@ -278,8 +268,7 @@ ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp = icmp ne i1 %cc, 0 %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b @@ -294,8 +283,7 @@ ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: fcmeq v2.4s, v2.4s, v3.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq float %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b @@ -309,8 +297,7 @@ ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 ; CHECK-NEXT: fcmeq v2.2d, v2.2d, v3.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq double %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -5,8 +5,7 @@ define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 { ; CHECK-LABEL: select_64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: %0 = bitcast <4 x half> %a to <4 x i16> @@ -23,8 +22,7 @@ define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 { ; CHECK-LABEL: select_128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: %0 = bitcast <8 x half> %a to <8 x i16> diff --git 
a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -480,9 +480,9 @@ ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: mov w9, #42 ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v0.2d, x9 -; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v1.2d, x9 +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %c = icmp ult <2 x i64> %x, %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> @@ -653,8 +653,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v2.16b, v1.16b ; CHECK-NEXT: cmhi v3.2d, v2.2d, v0.2d -; CHECK-NEXT: bsl v3.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.2d, v3.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %noty = xor <2 x i64> %y, %c = icmp ult <2 x i64> %x, %noty diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -71,10 +71,9 @@ ; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s ; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v2.2s -; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 -; CHECK-NEXT: bsl v1.8b, v0.8b, v2.8b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s +; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) ret <2 x float> %1 @@ -95,10 +94,9 @@ ; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s ; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v2.4s -; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; 
CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) ret <4 x float> %1 @@ -120,21 +118,19 @@ ; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.4s, v1.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.4s, v1.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v1.4s, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) ret <8 x float> %1 @@ -210,10 +206,9 @@ ; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d ; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v2.2d -; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d +; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) ret <2 x double> %1 @@ -238,24 +233,22 @@ ; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v0.2d, 
v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v3.2d -; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d ; CHECK-NEXT: fcmeq v3.2d, v1.2d, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) ret <4 x double> %1 diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll @@ -62,8 +62,7 @@ ; CHECK-LABEL: out_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, %x @@ -76,8 +75,7 @@ ; CHECK-LABEL: in_constant_varx_42: ; CHECK: // 
%bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, ; %x %n1 = and <4 x i32> %n0, %mask @@ -90,8 +88,7 @@ ; CHECK-LABEL: out_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, %x @@ -105,8 +102,7 @@ ; CHECK-LABEL: in_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> %x, ; %x @@ -169,9 +165,8 @@ define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, @@ -183,9 +178,8 @@ define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> , %y ; %x %n1 = and <4 x i32> %n0, %mask @@ -197,9 +191,8 @@ define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi 
v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, @@ -212,9 +205,8 @@ define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> , %y ; %x diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll @@ -13,8 +13,7 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i8> %x, %mask %notmask = xor <1 x i8> %mask, @@ -46,8 +45,7 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i16> %x, %mask %notmask = xor <1 x i16> %mask, @@ -111,8 +109,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -128,8 +125,7 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: 
out_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <8 x i8> %x, %mask %notmask = xor <8 x i8> %mask, @@ -141,8 +137,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -154,8 +149,7 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -167,8 +161,7 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: out_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, @@ -180,8 +173,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -197,8 +189,7 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: out_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <16 x i8> %x, %mask %notmask = xor <16 x i8> %mask, @@ -210,8 +201,7 @@ define <8 x i16> @out_v8i16(<8 x i16> 
%x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: out_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <8 x i16> %x, %mask %notmask = xor <8 x i16> %mask, @@ -223,8 +213,7 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -236,8 +225,7 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -249,8 +237,7 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: out_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, @@ -270,8 +257,7 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: in_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i8> %x, %y %n1 = and <1 x i8> %n0, %mask @@ -286,8 +272,7 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-LABEL: in_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i8> %x, %y %n1 = and <2 x i8> %n0, %mask @@ 
-298,8 +283,7 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: in_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i16> %x, %y %n1 = and <1 x i16> %n0, %mask @@ -314,8 +298,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: in_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -326,8 +309,7 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: in_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -338,8 +320,7 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: in_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i32> %x, %y %n1 = and <1 x i32> %n0, %mask @@ -354,8 +335,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -366,8 +346,7 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: in_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ 
-378,8 +357,7 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: in_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -390,8 +368,7 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: in_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i64> %x, %y %n1 = and <1 x i64> %n0, %mask @@ -406,8 +383,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -418,8 +394,7 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: in_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -430,8 +405,7 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: in_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, %y %n1 = and <4 x i32> %n0, %mask @@ -442,8 +416,7 @@ define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: in_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i64> %x, %y 
%n1 = and <2 x i64> %n0, %mask