diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1720,6 +1720,29 @@
     return TargetOpcode::G_PTRMASK;
   case Intrinsic::lrint:
     return TargetOpcode::G_INTRINSIC_LRINT;
+  // FADD/FMUL require checking the FMF, so are handled elsewhere.
+  case Intrinsic::vector_reduce_fmin:
+    return TargetOpcode::G_VECREDUCE_FMIN;
+  case Intrinsic::vector_reduce_fmax:
+    return TargetOpcode::G_VECREDUCE_FMAX;
+  case Intrinsic::vector_reduce_add:
+    return TargetOpcode::G_VECREDUCE_ADD;
+  case Intrinsic::vector_reduce_mul:
+    return TargetOpcode::G_VECREDUCE_MUL;
+  case Intrinsic::vector_reduce_and:
+    return TargetOpcode::G_VECREDUCE_AND;
+  case Intrinsic::vector_reduce_or:
+    return TargetOpcode::G_VECREDUCE_OR;
+  case Intrinsic::vector_reduce_xor:
+    return TargetOpcode::G_VECREDUCE_XOR;
+  case Intrinsic::vector_reduce_smax:
+    return TargetOpcode::G_VECREDUCE_SMAX;
+  case Intrinsic::vector_reduce_smin:
+    return TargetOpcode::G_VECREDUCE_SMIN;
+  case Intrinsic::vector_reduce_umax:
+    return TargetOpcode::G_VECREDUCE_UMAX;
+  case Intrinsic::vector_reduce_umin:
+    return TargetOpcode::G_VECREDUCE_UMIN;
   }
   return Intrinsic::not_intrinsic;
 }
@@ -2135,6 +2158,41 @@
       EntryMBB.insert(EntryMBB.begin(), LocalEscape);
     }
 
+    return true;
+  }
+  case Intrinsic::vector_reduce_fadd:
+  case Intrinsic::vector_reduce_fmul: {
+    // Need to check for the reassoc flag to decide whether we want a
+    // sequential reduction opcode or not.
+    Register Dst = getOrCreateVReg(CI);
+    Register ScalarSrc = getOrCreateVReg(*CI.getArgOperand(0));
+    Register VecSrc = getOrCreateVReg(*CI.getArgOperand(1));
+    unsigned Opc = 0;
+    if (!CI.hasAllowReassoc()) {
+      // The sequential ordering case.
+      Opc = ID == Intrinsic::vector_reduce_fadd
+                ? TargetOpcode::G_VECREDUCE_SEQ_FADD
+                : TargetOpcode::G_VECREDUCE_SEQ_FMUL;
+      MIRBuilder.buildInstr(Opc, {Dst}, {ScalarSrc, VecSrc},
+                            MachineInstr::copyFlagsFromInstruction(CI));
+      return true;
+    }
+    // We split the operation into a separate G_FADD/G_FMUL + the reduce,
+    // since the associativity doesn't matter.
+    unsigned ScalarOpc;
+    if (ID == Intrinsic::vector_reduce_fadd) {
+      Opc = TargetOpcode::G_VECREDUCE_FADD;
+      ScalarOpc = TargetOpcode::G_FADD;
+    } else {
+      Opc = TargetOpcode::G_VECREDUCE_FMUL;
+      ScalarOpc = TargetOpcode::G_FMUL;
+    }
+    LLT DstTy = MRI->getType(Dst);
+    auto Rdx = MIRBuilder.buildInstr(
+        Opc, {DstTy}, {VecSrc}, MachineInstr::copyFlagsFromInstruction(CI));
+    MIRBuilder.buildInstr(ScalarOpc, {Dst}, {ScalarSrc, Rdx},
+                          MachineInstr::copyFlagsFromInstruction(CI));
+    return true;
   }
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                        \
   case Intrinsic::INTRINSIC:
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -218,6 +218,11 @@
     cl::desc("Split out cold blocks from machine functions based on profile "
              "information."));
 
+/// Disable the expand reductions pass for testing.
+static cl::opt<bool> DisableExpandReductions(
+    "disable-expand-reductions", cl::init(false), cl::Hidden,
+    cl::desc("Disable the expand reduction intrinsics pass from running"));
+
 /// Allow standard passes to be disabled by command line options. This supports
 /// simple binary flags that either suppress the pass or do nothing.
 /// i.e. -disable-mypass=false has no effect.
@@ -708,7 +713,9 @@
     addPass(createScalarizeMaskedMemIntrinPass());
 
   // Expand reduction intrinsics into shuffle sequences if the target wants to.
-  addPass(createExpandReductionsPass());
+  // Allow disabling it for testing purposes.
+  if (!DisableExpandReductions)
+    addPass(createExpandReductionsPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -disable-expand-reductions -stop-after=irtranslator %s -o - | FileCheck %s
+
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
+
+define float @fadd_seq(float %start, <4 x float> %vec) {
+  ; CHECK-LABEL: name: fadd_seq
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q1, $s0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+  ; CHECK:   [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[COPY1]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_SEQ_FADD]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec)
+  ret float %res
+}
+
+define float @fadd_fast(float %start, <4 x float> %vec) {
+  ; CHECK-LABEL: name: fadd_fast
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q1, $s0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+  ; CHECK:   [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[COPY1]](<4 x s32>)
+  ; CHECK:   [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY]], [[VECREDUCE_FADD]]
+  ; CHECK:   $s0 = COPY [[FADD]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec)
+  ret float %res
+}
+
+define double @fmul_seq(double %start, <4 x double> %vec) {
+  ; CHECK-LABEL: name: fmul_seq
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $d0, $q1, $q2
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>)
+  ; CHECK:   [[VECREDUCE_SEQ_FMUL:%[0-9]+]]:_(s64) = G_VECREDUCE_SEQ_FMUL [[COPY]](s64), [[CONCAT_VECTORS]](<4 x s64>)
+  ; CHECK:   $d0 = COPY [[VECREDUCE_SEQ_FMUL]](s64)
+  ; CHECK:   RET_ReallyLR implicit $d0
+  %res = call double @llvm.vector.reduce.fmul.v4f64(double %start, <4 x double> %vec)
+  ret double %res
+}
+
+define double @fmul_fast(double %start, <4 x double> %vec) {
+  ; CHECK-LABEL: name: fmul_fast
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $d0, $q1, $q2
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>)
+  ; CHECK:   [[VECREDUCE_FMUL:%[0-9]+]]:_(s64) = reassoc G_VECREDUCE_FMUL [[CONCAT_VECTORS]](<4 x s64>)
+  ; CHECK:   [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[COPY]], [[VECREDUCE_FMUL]]
+  ; CHECK:   $d0 = COPY [[FMUL]](s64)
+  ; CHECK:   RET_ReallyLR implicit $d0
+  %res = call reassoc double @llvm.vector.reduce.fmul.v4f64(double %start, <4 x double> %vec)
+  ret double %res
+}
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+
+define float @fmax(<4 x float> %vec) {
+  ; CHECK-LABEL: name: fmax
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[COPY]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_FMAX]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %vec)
+  ret float %res
+}
+
+define float @fmin(<4 x float> %vec) {
+  ; CHECK-LABEL: name: fmin
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_FMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
+  ret float %res
+}
+
+define float @fmin_nnan(<4 x float> %vec) {
+  ; CHECK-LABEL: name: fmin_nnan
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_FMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
+  ret float %res
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+define i32 @add(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: add
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_ADD]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+
+define i32 @mul(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: mul
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_MUL:%[0-9]+]]:_(s32) = G_VECREDUCE_MUL [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_MUL]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+
+define i32 @and(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: and
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_AND:%[0-9]+]]:_(s32) = G_VECREDUCE_AND [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_AND]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+
+define i32 @or(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: or
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_OR:%[0-9]+]]:_(s32) = G_VECREDUCE_OR [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_OR]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+
+define i32 @xor(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: xor
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_XOR:%[0-9]+]]:_(s32) = G_VECREDUCE_XOR [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_XOR]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+
+define i32 @smax(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: smax
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_SMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_SMAX [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_SMAX]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+define i32 @smin(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: smin
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_SMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_SMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_SMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+define i32 @umax(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: umax
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_UMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_UMAX [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_UMAX]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+define i32 @umin(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: umin
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_UMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_UMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_UMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}