diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8579,7 +8579,7 @@
                            (f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
                            (f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
             (!cast<Instruction>(NAME # v1i16_indexed)
-              (EXTRACT_SUBREG V128:$Rn, hsub), V128:$Rm, VectorIndexH:$idx)>;
+              (f16 (EXTRACT_SUBREG V128:$Rn, hsub)), V128:$Rm, VectorIndexH:$idx)>;
 }
 
 let Predicates = [HasNEON] in {
@@ -9135,7 +9135,7 @@
                                              (i64 0))))),
             (!cast<Instruction>(NAME # v1i32_indexed)
               FPR32Op:$Rd,
-              (EXTRACT_SUBREG V64:$Rn, hsub),
+              (bf16 (EXTRACT_SUBREG V64:$Rn, hsub)),
               (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
               (i64 0))>;
 
@@ -9148,7 +9148,7 @@
                                              (i64 0))))),
             (!cast<Instruction>(NAME # v1i32_indexed)
               FPR32Op:$Rd,
-              (EXTRACT_SUBREG V64:$Rn, hsub),
+              (bf16 (EXTRACT_SUBREG V64:$Rn, hsub)),
              V128_lo:$Rm, VectorIndexH:$idx)>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -304,27 +304,37 @@
           (STXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
 
 multiclass SIMDAcrossLanesSignedIntrinsicBHS<string baseOpc, SDPatternOperator intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+  def : Pat<(i32 (sext (i8 (intOp (v8i8 V64:$Rn))))),
         (i32 (SMOVvi8to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+              (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
           (i64 0)))>;
-  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+  def : Pat<(i8 (intOp (v8i8 V64:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn)>;
+
+  def : Pat<(i32 (sext (i8 (intOp (v16i8 V128:$Rn))))),
         (i32 (SMOVvi8to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
           (i64 0)))>;
+  def : Pat<(i8 (intOp (v16i8 V128:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn)>;
 
-  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+  def : Pat<(i32 (sext (i16 (intOp (v4i16 V64:$Rn))))),
         (i32 (SMOVvi16to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
           (i64 0)))>;
-  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+  def : Pat<(i16 (intOp (v4i16 V64:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn)>;
+
+  def : Pat<(i32 (sext (i16 (intOp (v8i16 V128:$Rn))))),
         (i32 (SMOVvi16to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
           (i64 0)))>;
+  def : Pat<(i16 (intOp (v8i16 V128:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn)>;
 
   def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
         (i32 (EXTRACT_SUBREG
@@ -335,27 +345,46 @@
 multiclass SIMDAcrossLanesUnsignedIntrinsicBHS<string baseOpc, SDPatternOperator intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          ssub))>;
-  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          ssub))>;
+  def : Pat<(i32 (zext (i8 (intOp (v8i8 V64:$Rn))))),
+        (COPY_TO_REGCLASS
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+            ssub)),
+          GPR32)>;
+  def : Pat<(i8 (intOp (v8i8 V64:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn)>;
+
+  def : Pat<(i32 (zext (i8 (intOp (v16i8 V128:$Rn))))),
+        (COPY_TO_REGCLASS
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+            ssub)),
+          GPR32)>;
+  def : Pat<(i8 (intOp (v16i8 V128:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn)>;
+
-  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+  def : Pat<(i32 (zext (i16 (intOp (v4i16 V64:$Rn))))),
+        (COPY_TO_REGCLASS
           (i32 (EXTRACT_SUBREG
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
               (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-            ssub))>;
-  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          ssub))>;
+            ssub)),
+          GPR32)>;
+  def : Pat<(i16 (intOp (v4i16 V64:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn)>;
+
+  def : Pat<(i32 (zext (i16 (intOp (v8i16 V128:$Rn))))),
+        (COPY_TO_REGCLASS
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+            ssub)),
+          GPR32)>;
+  def : Pat<(i16 (intOp (v8i16 V128:$Rn))),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn)>;
 
   def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
         (i32 (EXTRACT_SUBREG
@@ -364,7 +393,6 @@
           ssub))>;
 }
 
-
 defm : SIMDAcrossLanesSignedIntrinsicBHS<"ADDV", int_aarch64_neon_saddv>;
 // vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
 def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
@@ -373,12 +401,23 @@
           (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
         ssub))>;
+def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))),
+      (i64 (EXTRACT_SUBREG
+        (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+          (ADDPv2i64p V128:$Rn), dsub),
+        dsub))>;
+
 defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"ADDV", int_aarch64_neon_uaddv>;
 def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
       (i32 (EXTRACT_SUBREG
         (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
           (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
         ssub))>;
+def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))),
+      (i64 (EXTRACT_SUBREG
+        (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+          (ADDPv2i64p V128:$Rn), dsub),
+        dsub))>;
 
 defm : SIMDAcrossLanesSignedIntrinsicBHS<"SMAXV", int_aarch64_neon_smaxv>;
 def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),
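
The new GlobalISel patterns above match both the bare i8/i16 reduction result and its sign-extended form. A minimal IR sketch of the shape they are meant to cover, modelled on the existing arm64-sminv.ll tests (function name is illustrative, not part of the patch); the expected selection is a single sminv plus smov, with no trailing sxtb:

define signext i8 @sketch_vminv_s8(<8 x i8> %v) {
entry:
  ; the signext return plus trunc becomes an i8 intrinsic result followed by a sign extend
  %r = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %v)
  %t = trunc i32 %r to i8
  ret i8 %t
}
declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)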
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3671,27 +3671,28 @@
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecROStoreLane0Pat {
   def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
                      (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
-            (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+            (STRW (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
                   GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
 
   def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
                      (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
-            (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+            (STRX (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
                   GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
 }
 
 let AddedComplexity = 19 in {
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
 }
 
 //---
@@ -3810,21 +3811,22 @@
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecStoreLane0Pat {
   def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
                      (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
-            (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+            (STR (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
                  GPR64sp:$Rn, IndexType:$offset)>;
 }
 
 let AddedComplexity = 19 in {
-  defm : VecStoreLane0Pat;
-  defm : VecStoreLane0Pat;
-  defm : VecStoreLane0Pat;
-  defm : VecStoreLane0Pat;
-  defm : VecStoreLane0Pat;
-  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
 }
 
 //---
@@ -3953,17 +3955,18 @@
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecStoreULane0Pat {
-  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
 }
 
 let AddedComplexity = 19 in {
-  defm : VecStoreULane0Pat;
-  defm : VecStoreULane0Pat;
-  defm : VecStoreULane0Pat;
-  defm : VecStoreULane0Pat;
-  defm : VecStoreULane0Pat;
-  defm : VecStoreULane0Pat;
+  defm : VecStoreULane0Pat;
+  defm : VecStoreULane0Pat;
+  defm : VecStoreULane0Pat;
+  defm : VecStoreULane0Pat;
+  defm : VecStoreULane0Pat;
+  defm : VecStoreULane0Pat;
 }
 
 //---
@@ -4460,7 +4463,7 @@
   def : Pat<(f16 (OpNode (f16 FPR16:$Rn),
                   (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
             (!cast<Instruction>(inst # inst_f16_suffix)
-              FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
+              FPR16:$Rn, (bf16 (EXTRACT_SUBREG V128:$Rm, hsub)))>;
 }
 let Predicates = preds in {
   def : Pat<(f32 (OpNode (f32 FPR32:$Rn),
@@ -7028,19 +7031,19 @@
 // Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported.
 
 def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
-          (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (SCVTFh (bf16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
-          (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (SCVTFh (bf16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
-          (SCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
+          (SCVTFh (bf16 (EXTRACT_SUBREG FPR64:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
               (and FPR32:$Rn, (i32 65535)), vecshiftR16:$imm)),
-          (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (UCVTFh (bf16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)),
-          (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (UCVTFh (bf16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
-          (UCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
+          (UCVTFh (bf16 (EXTRACT_SUBREG FPR64:$Rn, hsub)), vecshiftR16:$imm)>;
 
 def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
           (i32 (INSERT_SUBREG
             (i32 (IMPLICIT_DEF)),
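
The store-from-lane-0 changes above only make the EXTRACT_SUBREG result type explicit (SubRegTy); that is needed once FPR16 can also carry i16 (next hunk), because the subregister copy's type can no longer be inferred uniquely. IR of roughly this shape exercises those patterns; the name is illustrative, and the expected selection is a plain str of the h subregister:

define void @sketch_store_lane0_f16(<8 x half> %v, ptr %p) {
  %e = extractelement <8 x half> %v, i64 0
  store half %e, ptr %p
  ret void
}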
"H%u", 0, 31)> { let Size = 16; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1127,7 +1127,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { - switch (cast(MI).getIntrinsicID()) { + Intrinsic::ID IntrinsicID = cast(MI).getIntrinsicID(); + switch (IntrinsicID) { case Intrinsic::vacopy: { unsigned PtrSize = ST->isTargetILP32() ? 4 : 8; unsigned VaListSize = @@ -1207,6 +1208,36 @@ MI.eraseFromParent(); return true; } + case Intrinsic::aarch64_neon_uaddv: + case Intrinsic::aarch64_neon_saddv: + case Intrinsic::aarch64_neon_umaxv: + case Intrinsic::aarch64_neon_smaxv: + case Intrinsic::aarch64_neon_uminv: + case Intrinsic::aarch64_neon_sminv: { + MachineIRBuilder MIB(MI); + MachineRegisterInfo &MRI = *MIB.getMRI(); + bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv || + IntrinsicID == Intrinsic::aarch64_neon_smaxv || + IntrinsicID == Intrinsic::aarch64_neon_sminv; + + auto OldDst = MI.getOperand(0).getReg(); + auto OldDstTy = MRI.getType(OldDst); + LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType(); + if (OldDstTy == NewDstTy) + return true; + + auto NewDst = MRI.createGenericVirtualRegister(NewDstTy); + + Helper.Observer.changingInstr(MI); + MI.getOperand(0).setReg(NewDst); + Helper.Observer.changedInstr(MI); + + MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt()); + MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT, + OldDst, NewDst); + + return true; + } } return true; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -492,8 +492,12 @@ return false; case Intrinsic::aarch64_neon_uaddlv: case Intrinsic::aarch64_neon_uaddv: + case Intrinsic::aarch64_neon_saddv: case Intrinsic::aarch64_neon_umaxv: + case Intrinsic::aarch64_neon_smaxv: case Intrinsic::aarch64_neon_uminv: + case Intrinsic::aarch64_neon_sminv: + case Intrinsic::aarch64_neon_faddv: case Intrinsic::aarch64_neon_fmaxv: case Intrinsic::aarch64_neon_fminv: case Intrinsic::aarch64_neon_fmaxnmv: @@ -504,13 +508,6 @@ return SrcTy.getElementType().getSizeInBits() >= 16 && SrcTy.getElementCount().getFixedValue() >= 4; } - case Intrinsic::aarch64_neon_saddv: - case Intrinsic::aarch64_neon_smaxv: - case Intrinsic::aarch64_neon_sminv: { - const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); - return SrcTy.getElementType().getSizeInBits() >= 32 && - SrcTy.getElementCount().getFixedValue() >= 2; - } } } @@ -739,6 +736,22 @@ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; break; } + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_SEXT: { + // Allow G_SEXT/G_ZEXT from small FPR scalars to select across lane + // intrinsics. 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -492,8 +492,12 @@
     return false;
   case Intrinsic::aarch64_neon_uaddlv:
   case Intrinsic::aarch64_neon_uaddv:
+  case Intrinsic::aarch64_neon_saddv:
   case Intrinsic::aarch64_neon_umaxv:
+  case Intrinsic::aarch64_neon_smaxv:
   case Intrinsic::aarch64_neon_uminv:
+  case Intrinsic::aarch64_neon_sminv:
+  case Intrinsic::aarch64_neon_faddv:
   case Intrinsic::aarch64_neon_fmaxv:
   case Intrinsic::aarch64_neon_fminv:
   case Intrinsic::aarch64_neon_fmaxnmv:
@@ -504,13 +508,6 @@
     return SrcTy.getElementType().getSizeInBits() >= 16 &&
            SrcTy.getElementCount().getFixedValue() >= 4;
   }
-  case Intrinsic::aarch64_neon_saddv:
-  case Intrinsic::aarch64_neon_smaxv:
-  case Intrinsic::aarch64_neon_sminv: {
-    const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
-    return SrcTy.getElementType().getSizeInBits() >= 32 &&
-           SrcTy.getElementCount().getFixedValue() >= 2;
-  }
   }
 }
 
@@ -739,6 +736,22 @@
     OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
     break;
   }
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_SEXT: {
+    // Allow G_SEXT/G_ZEXT from small FPR scalars to select across lane
+    // intrinsics.
+    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+    if (getRegBank(MI.getOperand(1).getReg(), MRI, TRI) ==
+        &AArch64::FPRRegBank) {
+      if ((DstTy.getSizeInBits() == 32 || DstTy.getSizeInBits() == 64) &&
+          (SrcTy.getSizeInBits() == 8 || SrcTy.getSizeInBits() == 16) &&
+          SrcTy.getSizeInBits() < DstTy.getSizeInBits()) {
+        OpRegBankIdx[1] = PMI_FirstFPR;
+      }
+    }
+    break;
+  }
   case TargetOpcode::G_SITOFP:
   case TargetOpcode::G_UITOFP: {
     if (MRI.getType(MI.getOperand(0).getReg()).isVector())
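
The register-bank change keeps the scalar source of such a G_ZEXT/G_SEXT on FPR, so the widened value is read straight out of the vector register (fmov/smov) rather than being copied to a GPR and masked; the arm64-umaxv.ll checks below rely on exactly that (CHECK-NOT: and). A reduced sketch of that situation, with the name and control flow illustrative and modelled on the existing tests:

define i32 @sketch_umaxv_nonzero(<4 x i16> %v) nounwind {
entry:
  %r = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %v)
  %t = trunc i32 %r to i16
  %c = icmp eq i16 %t, 0
  br i1 %c, label %if.zero, label %if.nonzero
if.zero:
  ret i32 0
if.nonzero:
  ret i32 1
}
declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>)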
diff --git a/llvm/test/CodeGen/AArch64/arm64-smaxv.ll b/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
--- a/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
 ; CHECK: test_vmaxv_s8
diff --git a/llvm/test/CodeGen/AArch64/arm64-sminv.ll b/llvm/test/CodeGen/AArch64/arm64-sminv.ll
--- a/llvm/test/CodeGen/AArch64/arm64-sminv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sminv.ll
@@ -1,14 +1,11 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
-; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix=CHECK --check-prefix=GISEL
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define signext i8 @test_vminv_s8(<8 x i8> %a1) {
 ; CHECK: test_vminv_s8
 ; CHECK: sminv.8b b[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.b w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxtb w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a1)
   %0 = trunc i32 %vminv.i to i8
@@ -18,11 +15,8 @@
 define signext i16 @test_vminv_s16(<4 x i16> %a1) {
 ; CHECK: test_vminv_s16
 ; CHECK: sminv.4h h[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.h w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxth w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a1)
   %0 = trunc i32 %vminv.i to i16
@@ -43,11 +37,8 @@
 define signext i8 @test_vminvq_s8(<16 x i8> %a1) {
 ; CHECK: test_vminvq_s8
 ; CHECK: sminv.16b b[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.b w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxtb w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a1)
   %0 = trunc i32 %vminv.i to i8
@@ -57,11 +48,8 @@
 define signext i16 @test_vminvq_s16(<8 x i16> %a1) {
 ; CHECK: test_vminvq_s16
 ; CHECK: sminv.8h h[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.h w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxth w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a1)
   %0 = trunc i32 %vminv.i to i16
@@ -81,11 +69,8 @@
 define <8 x i8> @test_vminv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
 ; CHECK-LABEL: test_vminv_s8_used_by_laneop:
 ; CHECK: sminv.8b b[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.b v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.b v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a2)
   %1 = trunc i32 %0 to i8
@@ -96,11 +81,8 @@
 define <4 x i16> @test_vminv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
 ; CHECK-LABEL: test_vminv_s16_used_by_laneop:
 ; CHECK: sminv.4h h[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.h v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.h v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a2)
   %1 = trunc i32 %0 to i16
@@ -122,11 +104,8 @@
 define <16 x i8> @test_vminvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: test_vminvq_s8_used_by_laneop:
 ; CHECK: sminv.16b b[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.b v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.b v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a2)
   %1 = trunc i32 %0 to i8
@@ -137,11 +116,8 @@
 define <8 x i16> @test_vminvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
 ; CHECK-LABEL: test_vminvq_s16_used_by_laneop:
 ; CHECK: sminv.8h h[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.h v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.h v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a2)
   %1 = trunc i32 %0 to i16
diff --git a/llvm/test/CodeGen/AArch64/arm64-umaxv.ll b/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
--- a/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
@@ -1,13 +1,12 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix CHECK --check-prefix SDAG
-; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix CHECK --check-prefix GISEL
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
 ; CHECK-LABEL: vmax_u8x8:
 ; CHECK: umaxv.8b b[[REG:[0-9]+]], v0
 ; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz [[REG2]],
-; GISEL: b
+; CHECK: cbz [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i8
@@ -30,8 +29,7 @@
 ; CHECK: umaxv.4h h[[REG:[0-9]+]], v0
 ; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz [[REG2]],
-; GISEL: b
+; CHECK: cbz [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i16
@@ -52,8 +50,7 @@
 ; CHECK: umaxv.8h h[[REG:[0-9]+]], v0
 ; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz [[REG2]],
-; GISEL: b
+; CHECK: cbz [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i16
@@ -74,8 +71,7 @@
 ; CHECK: umaxv.16b b[[REG:[0-9]+]], v0
 ; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz [[REG2]],
-; GISEL: b
+; CHECK: cbz [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i8
diff --git a/llvm/test/CodeGen/AArch64/arm64-uminv.ll b/llvm/test/CodeGen/AArch64/arm64-uminv.ll
--- a/llvm/test/CodeGen/AArch64/arm64-uminv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-uminv.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
 ; CHECK-LABEL: vmin_u8x8:
diff --git a/llvm/test/CodeGen/AArch64/arm64-vaddv.ll b/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -global-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s
 
 define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
 ; CHECK-LABEL: test_vaddv_s8: