diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -524,6 +524,22 @@
   def int_ppc_altivec_vprtybq : GCCBuiltin<"__builtin_altivec_vprtybq">,
               Intrinsic<[llvm_v1i128_ty],[llvm_v1i128_ty],[IntrNoMem]>;
 
+  // BCD intrinsics. The value forms take the PS immediate as the last operand;
+  // the "_p" predicate forms take the predicate immediate as the first operand
+  // and return an i32.
+  def int_ppc_bcdadd : GCCBuiltin<"__builtin_ppc_bcdadd">, Intrinsic<
+    [llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
+    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_ppc_bcdadd_p : GCCBuiltin<"__builtin_ppc_bcdadd_p">, Intrinsic<
+    [llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+    [IntrNoMem, ImmArg<ArgIndex<0>>]>;
+  def int_ppc_bcdsub : GCCBuiltin<"__builtin_ppc_bcdsub">, Intrinsic<
+    [llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
+    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_ppc_bcdsub_p : GCCBuiltin<"__builtin_ppc_bcdsub_p">, Intrinsic<
+    [llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+    [IntrNoMem, ImmArg<ArgIndex<0>>]>;
+
   // P10 Vector Extract with Mask
   def int_ppc_altivec_vextractbm : GCCBuiltin<"__builtin_altivec_vextractbm">,
       Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
--- a/llvm/lib/Target/PowerPC/P10InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -626,7 +626,9 @@
 // 5 Cycles Fixed-Point and BCD operations, 3 input operands
 def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read, P10DX_Read],
       (instrs
+    BCDADD_rec,
     BCDS_rec,
+    BCDSUB_rec,
     BCDTRUNC_rec,
     VADDECUQ,
     VADDEUQM,
diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td
--- a/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -624,7 +624,9 @@
     BCDS_rec,
     BCDTRUNC_rec,
     BCDUS_rec,
-    BCDUTRUNC_rec
+    BCDUTRUNC_rec,
+    BCDADD_rec,
+    BCDSUB_rec
 )>;
 
 // 12 Cycle DFU operation.
Only one DFU unit per CPU so we use a whole
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5049,16 +5049,101 @@
     // value for the comparison. When selecting through a .td file, a type
     // error is raised. Must check this first so we never break on the
     // !Subtarget->isISA3_1() check.
-    if (N->getConstantOperandVal(0) == Intrinsic::ppc_fsels) {
+    auto IntID = N->getConstantOperandVal(0);
+    if (IntID == Intrinsic::ppc_fsels) {
       SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3)};
       CurDAG->SelectNodeTo(N, PPC::FSELS, MVT::f32, Ops);
       return;
     }
 
+    // BCD predicate intrinsics: operand 1 is the predicate immediate and
+    // operands 2 and 3 are the BCD inputs. Even predicates test one CR6 bit
+    // (eq, lt, gt, un); each odd successor tests the complement of that bit.
+    if (IntID == Intrinsic::ppc_bcdadd_p || IntID == Intrinsic::ppc_bcdsub_p) {
+      auto Pred = N->getConstantOperandVal(1);
+      unsigned Opcode =
+          IntID == Intrinsic::ppc_bcdadd_p ? PPC::BCDADD_rec : PPC::BCDSUB_rec;
+      unsigned SubReg = 0;
+      // Offset of the tested bit above the "un" bit within the CR6 field; used
+      // to form the RLWINM rotate amount on pre-Power10 targets.
+      unsigned ShiftVal = 0;
+      bool Reverse = false;
+      switch (Pred) {
+      case 0:
+        SubReg = PPC::sub_eq;
+        ShiftVal = 1;
+        break;
+      case 1:
+        SubReg = PPC::sub_eq;
+        ShiftVal = 1;
+        Reverse = true;
+        break;
+      case 2:
+        SubReg = PPC::sub_lt;
+        ShiftVal = 3;
+        break;
+      case 3:
+        SubReg = PPC::sub_lt;
+        ShiftVal = 3;
+        Reverse = true;
+        break;
+      case 4:
+        SubReg = PPC::sub_gt;
+        ShiftVal = 2;
+        break;
+      case 5:
+        SubReg = PPC::sub_gt;
+        ShiftVal = 2;
+        Reverse = true;
+        break;
+      case 6:
+        SubReg = PPC::sub_un;
+        break;
+      case 7:
+        SubReg = PPC::sub_un;
+        Reverse = true;
+        break;
+      default:
+        report_fatal_error("Invalid predicate for BCD predicate intrinsic");
+      }
+
+      EVT VTs[] = {MVT::v16i8, MVT::Glue};
+      SDValue Ops[] = {N->getOperand(2), N->getOperand(3),
+                       CurDAG->getTargetConstant(0, dl, MVT::i32)};
+      SDValue BCDOp = SDValue(CurDAG->getMachineNode(Opcode, dl, VTs, Ops), 0);
+      SDValue CR6Reg = CurDAG->getRegister(PPC::CR6, MVT::i32);
+      // On Power10, we can use SETBC[R]. On prior architectures, we have to use
+      // MFOCRF and shift/negate the value.
+      if (Subtarget->isISA3_1()) {
+        SDValue SubRegIdx = CurDAG->getTargetConstant(SubReg, dl, MVT::i32);
+        SDValue CRBit = SDValue(
+            CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i1,
+                                   CR6Reg, SubRegIdx, BCDOp.getValue(1)),
+            0);
+        CurDAG->SelectNodeTo(N, Reverse ? PPC::SETBCR : PPC::SETBC, MVT::i32,
+                             CRBit);
+      } else {
+        SDValue Move =
+            SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR6Reg,
+                                           BCDOp.getValue(1)),
+                    0);
+        SDValue RotateOps[] = {Move, getI32Imm((32 - (4 + ShiftVal)) & 31, dl),
+                               getI32Imm(31, dl), getI32Imm(31, dl)};
+        if (!Reverse)
+          CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, RotateOps);
+        else {
+          SDValue Shift = SDValue(
+              CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, RotateOps), 0);
+          CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Shift, getI32Imm(1, dl));
+        }
+      }
+      return;
+    }
+
     if (!Subtarget->isISA3_1())
       break;
     unsigned Opcode = 0;
-    switch (N->getConstantOperandVal(0)) {
+    switch (IntID) {
     default:
       break;
     case Intrinsic::ppc_altivec_vstribr_p:
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1161,6 +1161,22 @@
 
 } // end HasAltivec
 
+// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
+class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+  : VX_RD5_RSp5_PS1_XO9<xo,
+                        (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
+                        !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
+    let Defs = [CR6];
+}
+
+// [PO VRT VRA VRB 1 / XO]
+class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+  : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                        !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
+    let Defs = [CR6];
+    let PS = 0;
+}
+
 def HasP8Altivec : Predicate<"Subtarget->hasP8Altivec()">;
 def HasP8Crypto : Predicate<"Subtarget->hasP8Crypto()">;
 let Predicates = [HasP8Altivec] in {
@@ -1351,6 +1367,13 @@
                             v2i64, v4i32>;
 def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw,
                           v2i64, v4i32>;
+def BCDADD_rec : VX_VT5_VA5_VB5_PS1_XO9_o<1, "bcdadd." , []>;
+def BCDSUB_rec : VX_VT5_VA5_VB5_PS1_XO9_o<65, "bcdsub."
, []>;
+
+def : Pat<(v16i8 (int_ppc_bcdadd v16i8:$vA, v16i8:$vB, timm:$PS)),
+          (BCDADD_rec $vA, $vB, $PS)>;
+def : Pat<(v16i8 (int_ppc_bcdsub v16i8:$vA, v16i8:$vB, timm:$PS)),
+          (BCDSUB_rec $vA, $vB, $PS)>;
 
 // Shuffle patterns for unary and swapped (LE) vector pack modulo.
 def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef),
@@ -1598,22 +1621,6 @@
 
 def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
 
-// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
-class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
-  : VX_RD5_RSp5_PS1_XO9<xo,
-                        (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
-                        !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
-    let Defs = [CR6];
-}
-
-// [PO VRT VRA VRB 1 / XO]
-class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
-  : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
-    let Defs = [CR6];
-    let PS = 0;
-}
-
 // Decimal Shift/Unsigned-Shift/Shift-and-Round
 def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
 def BCDUS_rec : VX_VT5_VA5_VB5_XO9_o <129, "bcdus.", []>;
diff --git a/llvm/test/CodeGen/PowerPC/bcd-intrinsics.ll b/llvm/test/CodeGen/PowerPC/bcd-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/bcd-intrinsics.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-P9
+
+define dso_local i64 @test_invalid(<16 x i8> %a) local_unnamed_addr #0 {
+; CHECK-LABEL: test_invalid:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v2, 0
+; CHECK-NEXT:    setbc r3, 4*cr6+un
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_invalid:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v2, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 28, 31, 31
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdsub.p(i32 6, <16 x i8> %a, <16 x i8> %a) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+define dso_local <16 x i8> @test_add(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
+; CHECK-LABEL: test_add:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdadd. v2, v2, v3, 1
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_add:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdadd. v2, v2, v3, 1
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call <16 x i8> @llvm.ppc.bcdadd(<16 x i8> %a, <16 x i8> %b, i32 1)
+  ret <16 x i8> %0
+}
+
+define dso_local i64 @test_add_ofl(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
+; CHECK-LABEL: test_add_ofl:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdadd. v2, v2, v3, 0
+; CHECK-NEXT:    setbc r3, 4*cr6+un
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_add_ofl:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdadd. v2, v2, v3, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 28, 31, 31
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdadd.p(i32 6, <16 x i8> %a, <16 x i8> %b) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+define dso_local <16 x i8> @test_sub(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
+; CHECK-LABEL: test_sub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_sub:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call <16 x i8> @llvm.ppc.bcdsub(<16 x i8> %a, <16 x i8> %b, i32 0)
+  ret <16 x i8> %0
+}
+
+define dso_local i64 @test_sub_ofl(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
+; CHECK-LABEL: test_sub_ofl:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-NEXT:    setbc r3, 4*cr6+un
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_sub_ofl:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 28, 31, 31
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdsub.p(i32 6, <16 x i8> %a, <16 x i8> %b) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+define dso_local i64 @test_cmplt(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_cmplt:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-NEXT:    setbc r3, 4*cr6+lt
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_cmplt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 25, 31, 31
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdsub.p(i32 2, <16 x i8> %a, <16 x i8> %b) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+define dso_local i64 @test_cmpgt(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_cmpgt:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-NEXT:    setbc r3, 4*cr6+gt
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_cmpgt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 26, 31, 31
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdsub.p(i32 4, <16 x i8> %a, <16 x i8> %b) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+define dso_local i64 @test_cmpeq(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_cmpeq:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-NEXT:    setbc r3, 4*cr6+eq
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_cmpeq:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 27, 31, 31
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdsub.p(i32 0, <16 x i8> %a, <16 x i8> %b) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+define dso_local i64 @test_cmpge(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_cmpge:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-NEXT:    setbcr r3, 4*cr6+lt
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_cmpge:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 25, 31, 31
+; CHECK-P9-NEXT:    xori r3, r3, 1
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdsub.p(i32 3, <16 x i8> %a, <16 x i8> %b) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+define dso_local i64 @test_cmple(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_cmple:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-NEXT:    setbcr r3, 4*cr6+gt
+; CHECK-NEXT:    extsw r3, r3
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test_cmple:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    bcdsub. v2, v2, v3, 0
+; CHECK-P9-NEXT:    mfocrf r3, 2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 26, 31, 31
+; CHECK-P9-NEXT:    xori r3, r3, 1
+; CHECK-P9-NEXT:    extsw r3, r3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.ppc.bcdsub.p(i32 5, <16 x i8> %a, <16 x i8> %b) #2
+  %conv.i = sext i32 %0 to i64
+  ret i64 %conv.i
+}
+
+declare i32 @llvm.ppc.bcdsub.p(i32 immarg, <16 x i8>, <16 x i8>) #1
+declare i32 @llvm.ppc.bcdadd.p(i32 immarg, <16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.ppc.bcdadd(<16 x i8>, <16 x i8>, i32 immarg) #1
+declare <16 x i8> @llvm.ppc.bcdsub(<16 x i8>, <16 x i8>, i32 immarg) #1