diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -302,3 +302,108 @@
           (STLXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
 def : Pat<(int_aarch64_stxp GPR64:$lo, GPR64:$hi, GPR64:$addr),
           (STXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
+
+multiclass SIMDAcrossLanesSignedIntrinsicBHS<string baseOpc,
+                                             SDPatternOperator intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+            (i32 (SMOVvi8to32
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+              (i64 0)))>;
+  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+            (i32 (SMOVvi8to32
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+              (i64 0)))>;
+
+  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+            (i32 (SMOVvi16to32
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+              (i64 0)))>;
+  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+            (i32 (SMOVvi16to32
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+              (i64 0)))>;
+
+  def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+            (i32 (EXTRACT_SUBREG
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+              ssub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedIntrinsicBHS<string baseOpc,
+                                               SDPatternOperator intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+            (i32 (EXTRACT_SUBREG
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+              ssub))>;
+  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+            (i32 (EXTRACT_SUBREG
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+              ssub))>;
+
+  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+            (i32 (EXTRACT_SUBREG
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+              ssub))>;
+  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+            (i32 (EXTRACT_SUBREG
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+              ssub))>;
+
+  def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+            (i32 (EXTRACT_SUBREG
+              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+              ssub))>;
+}
+
+
+defm : SIMDAcrossLanesSignedIntrinsicBHS<"ADDV", int_aarch64_neon_saddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
+            ssub))>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"ADDV", int_aarch64_neon_uaddv>;
+def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
+            ssub))>;
+
+defm : SIMDAcrossLanesSignedIntrinsicBHS<"SMAXV", int_aarch64_neon_smaxv>;
+def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (SMAXPv2i32 V64:$Rn, V64:$Rn), dsub),
+            ssub))>;
+
+defm : SIMDAcrossLanesSignedIntrinsicBHS<"SMINV", int_aarch64_neon_sminv>;
+def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (SMINPv2i32 V64:$Rn, V64:$Rn), dsub),
+            ssub))>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"UMAXV", int_aarch64_neon_umaxv>;
+def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (UMAXPv2i32 V64:$Rn, V64:$Rn), dsub),
+            ssub))>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"UMINV", int_aarch64_neon_uminv>;
+def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (UMINPv2i32 V64:$Rn, V64:$Rn), dsub),
+            ssub))>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2804,7 +2804,7 @@
 defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
 
 // Floating-point
-defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", untyped, load>;
+defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", i8, load>;
 defm LDRH : Load16RO<0b01, 1, 0b01, FPR16Op, "ldr", f16, load>;
 defm LDRS : Load32RO<0b10, 1, 0b01, FPR32Op, "ldr", f32, load>;
 defm LDRD : Load64RO<0b11, 1, 0b01, FPR64Op, "ldr", f64, load>;
@@ -3569,7 +3569,7 @@
 
 // Floating-point
-defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>;
+defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", i8, store>;
 defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
 defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
 defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
@@ -3979,7 +3979,7 @@
 // (immediate pre-indexed)
 def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32z, "str", pre_store, i32>;
 def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64z, "str", pre_store, i64>;
-def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, untyped>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, i8>;
 def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16Op, "str", pre_store, f16>;
 def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32Op, "str", pre_store, f32>;
 def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64Op, "str", pre_store, f64>;
@@ -4033,7 +4033,7 @@
 // (immediate post-indexed)
 def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32z, "str", post_store, i32>;
 def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64z, "str", post_store, i64>;
-def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, untyped>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, i8>;
 def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16Op, "str", post_store, f16>;
 def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32Op, "str", post_store, f32>;
 def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64Op, "str", post_store, f64>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -435,7 +435,7 @@
 def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
 }
 
-def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
+def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> {
   let Size = 8;
 }
 def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3166,15 +3166,15 @@
   let Predicates = [NotInStreamingSVEMode] in {
   def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
             (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
-  def : Pat<(sext_inreg (anyext (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), i8),
+  def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index))), i8),
             (i64 (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
 
   def : Pat<(sext_inreg (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index), i16),
             (i32 (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
-  def : Pat<(sext_inreg (anyext (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)), i16),
+  def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index))), i16),
             (i64 (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
 
-  def : Pat<(sext (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
+  def : Pat<(sext (i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index))),
             (i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
   } // End NotInStreamingSVEMode
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -481,14 +481,35 @@
       getValueMapping(RBIdx, Size), NumOperands);
 }
 
-/// \returns true if a given intrinsic \p ID only uses and defines FPRs.
-static bool isFPIntrinsic(unsigned ID) {
+/// \returns true if a given intrinsic only uses and defines FPRs.
+static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
+                          const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC);
   // TODO: Add more intrinsics.
-  switch (ID) {
+  switch (MI.getIntrinsicID()) {
   default:
     return false;
   case Intrinsic::aarch64_neon_uaddlv:
+  case Intrinsic::aarch64_neon_uaddv:
+  case Intrinsic::aarch64_neon_umaxv:
+  case Intrinsic::aarch64_neon_uminv:
+  case Intrinsic::aarch64_neon_fmaxv:
+  case Intrinsic::aarch64_neon_fminv:
+  case Intrinsic::aarch64_neon_fmaxnmv:
+  case Intrinsic::aarch64_neon_fminnmv:
     return true;
+  case Intrinsic::aarch64_neon_saddlv: {
+    const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
+    return SrcTy.getElementType().getSizeInBits() >= 16 &&
+           SrcTy.getElementCount().getFixedValue() >= 4;
+  }
+  case Intrinsic::aarch64_neon_saddv:
+  case Intrinsic::aarch64_neon_smaxv:
+  case Intrinsic::aarch64_neon_sminv: {
+    const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
+    return SrcTy.getElementType().getSizeInBits() >= 32 &&
+           SrcTy.getElementCount().getFixedValue() >= 2;
+  }
   }
 }
 
@@ -497,7 +518,7 @@
                                                const TargetRegisterInfo &TRI,
                                                unsigned Depth) const {
   unsigned Op = MI.getOpcode();
-  if (Op == TargetOpcode::G_INTRINSIC && isFPIntrinsic(MI.getIntrinsicID()))
+  if (Op == TargetOpcode::G_INTRINSIC && isFPIntrinsic(MRI, MI))
     return true;
 
   // Do we have an explicit floating point instruction?
@@ -996,9 +1017,8 @@
   case TargetOpcode::G_INTRINSIC: {
     // Check if we know that the intrinsic has any constraints on its register
     // banks. If it does, then update the mapping accordingly.
-    unsigned ID = MI.getIntrinsicID();
     unsigned Idx = 0;
-    if (!isFPIntrinsic(ID))
+    if (!isFPIntrinsic(MRI, MI))
       break;
     for (const auto &Op : MI.explicit_operands()) {
       if (Op.isReg())
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG
+; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64-eabi -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; Function Attrs: nounwind readnone
 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
@@ -9,6 +10,14 @@
 declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
 
+; GISEL-NOT: Instruction selection used fallback path for add_B
+; GISEL-NOT: Instruction selection used fallback path for add_H
+; GISEL-NOT: Instruction selection used fallback path for add_S
+; GISEL-NOT: Instruction selection used fallback path for add_D
+; GISEL-NOT: Instruction selection used fallback path for oversized_ADDV_512
+; GISEL-NOT: Instruction selection used fallback path for addv_combine_i32
+; GISEL-NOT: Instruction selection used fallback path for addv_combine_i64
+
 define i8 @add_B(ptr %arr) {
 ; CHECK-LABEL: add_B:
 ; CHECK: // %bb.0:
@@ -84,16 +93,27 @@
 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
 
 define i32 @oversized_ADDV_512(ptr %arr) {
-; CHECK-LABEL: oversized_ADDV_512:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0, #32]
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: oversized_ADDV_512:
+; SDAG: // %bb.0:
+; SDAG-NEXT: ldp q0, q1, [x0, #32]
+; SDAG-NEXT: ldp q3, q2, [x0]
+; SDAG-NEXT: add v0.4s, v3.4s, v0.4s
+; SDAG-NEXT: add v1.4s, v2.4s, v1.4s
+; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
+; SDAG-NEXT: addv s0, v0.4s
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: oversized_ADDV_512:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldp q0, q1, [x0]
+; GISEL-NEXT: ldp q2, q3, [x0, #32]
+; GISEL-NEXT: add v0.4s, v0.4s, v1.4s
+; GISEL-NEXT: add v1.4s, v2.4s, v3.4s
+; GISEL-NEXT: add v0.4s, v0.4s, v1.4s
+; GISEL-NEXT: addv s0, v0.4s
+; GISEL-NEXT: fmov w0, s0
+; GISEL-NEXT: ret
   %bin.rdx = load <16 x i32>, ptr %arr
   %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
   ret i32 %r
@@ -128,12 +148,21 @@
 }
 
 define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) {
-; CHECK-LABEL: addv_combine_i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_combine_i32:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
+; SDAG-NEXT: addv s0, v0.4s
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_combine_i32:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: addv s0, v0.4s
+; GISEL-NEXT: addv s1, v1.4s
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: add w0, w8, w9
+; GISEL-NEXT: ret
 entry:
   %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1)
   %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2)
@@ -142,12 +171,21 @@
 }
 
 define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) {
-; CHECK-LABEL: addv_combine_i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_combine_i64:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.2d, v0.2d, v1.2d
+; SDAG-NEXT: addp d0, v0.2d
+; SDAG-NEXT: fmov x0, d0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_combine_i64:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: addp d0, v0.2d
+; GISEL-NEXT: addp d1, v1.2d
+; GISEL-NEXT: fmov x8, d0
+; GISEL-NEXT: fmov x9, d1
+; GISEL-NEXT: add x0, x8, x9
+; GISEL-NEXT: ret
 entry:
   %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1)
   %rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2)
diff --git a/llvm/test/CodeGen/AArch64/arm64-fminv.ll b/llvm/test/CodeGen/AArch64/arm64-fminv.ll
--- a/llvm/test/CodeGen/AArch64/arm64-fminv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fminv.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
 
 define float @test_fminv_v2f32(<2 x float> %in) {
 ; CHECK: test_fminv_v2f32:
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -global-isel=1 -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>)
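
; Illustrative usage sketch (not part of the patch above): IR of the shape the
; new GlobalISel patterns cover. The intrinsic and types are taken from the
; tests touched here; the function name below is made up. With -global-isel=1
; this is expected to select directly (roughly: umaxv h0, v0.4h / fmov w0, s0 /
; ret) instead of falling back to SelectionDAG.
declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>)

define i32 @umaxv_4h_example(<4 x i16> %v) {
  %r = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %v)
  ret i32 %r
}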