diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -152,6 +152,7 @@
   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) const;
 
   unsigned emitConstantPoolEntry(const Constant *CPVal,
                                  MachineFunction &MF) const;
@@ -2959,11 +2960,52 @@
     return selectConcatVectors(I, MRI);
   case TargetOpcode::G_JUMP_TABLE:
     return selectJumpTable(I, MRI);
+  case TargetOpcode::G_VECREDUCE_FADD:
+  case TargetOpcode::G_VECREDUCE_ADD:
+    return selectReduction(I, MRI);
   }
 
   return false;
 }
 
+bool AArch64InstructionSelector::selectReduction(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  Register VecReg = I.getOperand(1).getReg();
+  LLT VecTy = MRI.getType(VecReg);
+  if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
+    unsigned Opc = 0;
+    if (VecTy == LLT::vector(16, 8))
+      Opc = AArch64::ADDVv16i8v;
+    else if (VecTy == LLT::vector(8, 16))
+      Opc = AArch64::ADDVv8i16v;
+    else if (VecTy == LLT::vector(4, 32))
+      Opc = AArch64::ADDVv4i32v;
+    else if (VecTy == LLT::vector(2, 64))
+      Opc = AArch64::ADDPv2i64p;
+    else {
+      LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
+      return false;
+    }
+    I.setDesc(TII.get(Opc));
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
+    unsigned Opc = 0;
+    if (VecTy == LLT::vector(2, 32))
+      Opc = AArch64::FADDPv2i32p;
+    else if (VecTy == LLT::vector(2, 64))
+      Opc = AArch64::FADDPv2i64p;
+    else {
+      LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
+      return false;
+    }
+    I.setDesc(TII.get(Opc));
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+  return false;
+}
+
 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
                                             MachineRegisterInfo &MRI) const {
   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-reduce-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-reduce-add.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-reduce-add.mir
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=1 %s -o - | FileCheck %s
+---
+name: add_B
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0' }
+body: |
+  bb.1:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: add_B
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 :: (load 16)
+    ; CHECK: [[ADDVv16i8v:%[0-9]+]]:fpr8 = ADDVv16i8v [[LDRQui]]
+    ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, [[ADDVv16i8v]], %subreg.bsub
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr32all = COPY [[SUBREG_TO_REG]]
+    ; CHECK: $w0 = COPY [[COPY1]]
+    ; CHECK: RET_ReallyLR implicit $w0
+    %0:gpr(p0) = COPY $x0
+    %1:fpr(<16 x s8>) = G_LOAD %0(p0) :: (load 16)
+    %2:fpr(s8) = G_VECREDUCE_ADD %1(<16 x s8>)
+    %4:gpr(s8) = COPY %2(s8)
+    %3:gpr(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %3(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: add_H
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0' }
+body: |
+  bb.1:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: add_H
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 :: (load 16)
+    ; CHECK: [[ADDVv8i16v:%[0-9]+]]:fpr16 = ADDVv8i16v [[LDRQui]]
+    ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, [[ADDVv8i16v]], %subreg.hsub
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr32all = COPY [[SUBREG_TO_REG]]
+    ; CHECK: $w0 = COPY [[COPY1]]
+    ; CHECK: RET_ReallyLR implicit $w0
+    %0:gpr(p0) = COPY $x0
+    %1:fpr(<8 x s16>) = G_LOAD %0(p0) :: (load 16)
+    %2:fpr(s16) = G_VECREDUCE_ADD %1(<8 x s16>)
+    %4:gpr(s16) = COPY %2(s16)
+    %3:gpr(s32) = G_ANYEXT %4(s16)
+    $w0 = COPY %3(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: add_S
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0' }
+body: |
+  bb.1:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: add_S
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 :: (load 16)
+    ; CHECK: [[ADDVv4i32v:%[0-9]+]]:fpr32 = ADDVv4i32v [[LDRQui]]
+    ; CHECK: $w0 = COPY [[ADDVv4i32v]]
+    ; CHECK: RET_ReallyLR implicit $w0
+    %0:gpr(p0) = COPY $x0
+    %1:fpr(<4 x s32>) = G_LOAD %0(p0) :: (load 16)
+    %2:fpr(s32) = G_VECREDUCE_ADD %1(<4 x s32>)
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: add_D
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0' }
+body: |
+  bb.1:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: add_D
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 :: (load 16)
+    ; CHECK: [[ADDPv2i64p:%[0-9]+]]:fpr64 = ADDPv2i64p [[LDRQui]]
+    ; CHECK: $x0 = COPY [[ADDPv2i64p]]
+    ; CHECK: RET_ReallyLR implicit $x0
+    %0:gpr(p0) = COPY $x0
+    %1:fpr(<2 x s64>) = G_LOAD %0(p0) :: (load 16)
+    %2:fpr(s64) = G_VECREDUCE_ADD %1(<2 x s64>)
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-reduce-fadd.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-reduce-fadd.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-reduce-fadd.mir
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=1 %s -o - | FileCheck %s
+---
+name: fadd_v2s32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: fadd_v2s32
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[FADDPv2i32p:%[0-9]+]]:fpr32 = FADDPv2i32p [[COPY]]
+    ; CHECK: $w0 = COPY [[FADDPv2i32p]]
+    ; CHECK: RET_ReallyLR implicit $w0
+    %0:fpr(<2 x s32>) = COPY $d0
+    %1:fpr(s32) = G_VECREDUCE_FADD %0(<2 x s32>)
+    $w0 = COPY %1(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: fadd_v2s64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: fadd_v2s64
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK: [[FADDPv2i64p:%[0-9]+]]:fpr64 = FADDPv2i64p [[COPY]]
+    ; CHECK: $x0 = COPY [[FADDPv2i64p]]
+    ; CHECK: RET_ReallyLR implicit $x0
+    %0:fpr(<2 x s64>) = COPY $q0
+    %2:fpr(s64) = G_VECREDUCE_FADD %0(<2 x s64>)
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -142,7 +142,7 @@
 }
 
 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
 
 define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
 ; CHECK-LABEL: uabd16b_rdx
@@ -168,7 +168,7 @@
   %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
   %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
   %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
-  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %absel)
+  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
   ret i32 %reduced_v
 }
@@ -181,13 +181,13 @@
   %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
   %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
   %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
-  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %absel)
+  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
   ret i32 %reduced_v
 }
 
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 
 define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
 ; CHECK-LABEL: uabd8h_rdx
@@ -219,19 +219,22 @@
 define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: uabdl4s_rdx_i32
-; CHECK: uabdl.4s
+; DAG: uabdl.4s
+
+; GISel doesn't match this pattern yet.
+; GISEL: addv.4s
   %aext = zext <4 x i16> %a to <4 x i32>
   %bext = zext <4 x i16> %b to <4 x i32>
   %abdiff = sub nsw <4 x i32> %aext, %bext
   %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
   %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
   %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
-  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %absel)
+  %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
   ret i32 %reduced_v
 }
 
 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
 
 define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
 ; CHECK: uabd4s_rdx
@@ -263,14 +266,17 @@
 define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: uabdl2d_rdx_i64
-; CHECK: uabdl.2d
+; DAG: uabdl.2d
+
+; GISel doesn't match this pattern yet
+; GISEL: addp.2d
   %aext = zext <2 x i32> %a to <2 x i64>
   %bext = zext <2 x i32> %b to <2 x i64>
   %abdiff = sub nsw <2 x i64> %aext, %bext
   %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
   %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
   %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
-  %reduced_v = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %absel)
+  %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
   ret i64 %reduced_v
 }