diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -102,6 +102,9 @@
     /// vector or scalar.
     XXSPLTI_SP_TO_DP,
 
+    /// XXSPLTI32DX - The PPC XXSPLTI32DX instruction.
+    XXSPLTI32DX,
+
     /// VECINSERT - The PPC vector insert instruction
     ///
     VECINSERT,
@@ -1270,6 +1273,10 @@
     /// essentially v16i8 vector version of VINSERTH.
     SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
 
+    /// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+    /// handled by the XXSPLTI32DX instruction introduced in ISA 3.1.
+    SDValue lowerToXXSPLTI32DX(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
     // Return whether the call instruction can potentially be optimized to a
     // tail call. This will cause the optimizers to attempt to move, or
     // duplicate return instructions to help enable tail call optimizations.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1477,6 +1477,8 @@
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
   case PPCISD::XXSPLTI_SP_TO_DP:
     return "PPCISD::XXSPLTI_SP_TO_DP";
+  case PPCISD::XXSPLTI32DX:
+    return "PPCISD::XXSPLTI32DX";
   case PPCISD::VECINSERT:       return "PPCISD::VECINSERT";
   case PPCISD::XXPERMDI:        return "PPCISD::XXPERMDI";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
@@ -9778,6 +9780,83 @@
   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
 }
 
+/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
+/// return the default SDValue.
+SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
+                                              SelectionDAG &DAG) const {
+  SDValue LHS = SVN->getOperand(0);
+  SDValue RHS = SVN->getOperand(1);
+  auto ShuffleMask = SVN->getMask();
+  SDValue VecShuffle(SVN, 0);
+  SDLoc DL(SVN);
+
+  // Check that we have a four byte shuffle.
+  if (!isNByteElemShuffleMask(SVN, 4, 1))
+    return SDValue();
+
+  // The LHS and RHS may be bitcasts to v8i16 as we canonicalize shuffles
+  // to v8i16. Peek through the bitcasts to get the actual operands,
+  // and canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
+  LHS = peekThroughBitcasts(LHS);
+  RHS = peekThroughBitcasts(RHS);
+  if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
+    std::swap(LHS, RHS);
+    VecShuffle = DAG.getCommutedVectorShuffle(*SVN);
+    ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask();
+  }
+
+  // Ensure that the RHS is a vector of constants.
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+  if (!BVN)
+    return SDValue();
+
+  APInt APSplatValue, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  bool IsBVNConstSplat =
+      BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
+                           HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+  if (!IsBVNConstSplat)
+    return SDValue();
+
+  // Check if RHS is a splat of 4-bytes (or smaller).
+  if ((SplatBitSize / 8) > 4)
+    return SDValue();
+
+  // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
+  // XXSPLTI32DX can insert 4 byte chunks from the constant splat C into:
+  // { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }
+  // Thus we check if the shuffle is any of the following 32 masks
+  // (represented as a 4-byte element shuffle in this comment):
+  // Case 1: <0, 4-7, 2, 4-7> and
+  // Case 2: <4-7, 1, 4-7, 3>
+  SDValue Index;
+  bool IsLE = Subtarget.isLittleEndian();
+  if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
+      (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
+       ShuffleMask[4] > 15 && ShuffleMask[12] > 15)) // Case 1.
+    Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i1);
+  else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) && // Case 2.
+           (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
+            ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
+    Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i1);
+  else
+    return SDValue();
+
+  // The size check above only rejects splats wider than 4 bytes, so
+  // SplatBitSize may be smaller than 32 here. The node takes an i32
+  // immediate, so replicate a narrower splat value until it fills 32 bits.
+  unsigned SplatVal = APSplatValue.getZExtValue();
+  for (; SplatBitSize < 32; SplatBitSize <<= 1)
+    SplatVal |= (SplatVal << SplatBitSize);
+
+  SDValue SplatNode = DAG.getNode(
+      PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
+      Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
+}
+
 /// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
 /// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
 /// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
@@ -9895,6 +9974,12 @@
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
   }
 
+  if (Subtarget.hasPrefixInstrs()) {
+    SDValue SplatInsertNode;
+    if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
+      return SplatInsertNode;
+  }
+
   if (Subtarget.hasP9Altivec()) {
     SDValue NewISDNode;
     if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1,3 +1,19 @@
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+
+def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
+  SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
+]>;
+
+//===----------------------------------------------------------------------===//
+// P10 specific PPCISD nodes.
+//
+
+def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+
+//===----------------------------------------------------------------------===//
+
 // PC Relative flag (for instructions that use the address of the prefix for
 // address computations).
class isPCRel { bit PCRel = 1; }
@@ -733,7 +749,10 @@
   def XXSPLTI32DX :
       8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
                              (ins vsrc:$XTi, i1imm:$IX, i32imm:$IMM32),
-                             "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral, []>,
+                             "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral,
+                             [(set v2i64:$XT,
+                                   (PPCxxsplti32dx v2i64:$XTi, i1:$IX,
+                                                   i32:$IMM32))]>,
                              RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
   def XXPERMX :
     8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE(review): the shufflevector constant operands and masks below were
+; reconstructed to agree with the CHECK lines - confirm against the original.
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
+; RUN:     FileCheck --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
+; RUN:     FileCheck --check-prefix=CHECK-BE %s
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_1(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_1:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, -1, 566
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_1:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, 566
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> %a, <4 x i32> <i32 undef, i32 566, i32 undef, i32 566>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_2(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_2:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 0, 33
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_2:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, -1, 33
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> <i32 33, i32 undef, i32 33, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_3(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_3:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, -1, 12
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_3:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, 12
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> %a, <4 x i32> <i32 undef, i32 12, i32 undef, i32 12>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_4(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_4:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 0, -683
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_4:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, -1, -683
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> <i32 -683, i32 undef, i32 -683, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: nounwind
+define <4 x float> @test_xxsplti32dx_5(<4 x float> %vfa) {
+; CHECK-LE-LABEL: test_xxsplti32dx_5:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, -1, 1065353216
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_5:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, 1065353216
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins3.i = shufflevector <4 x float> %vfa, <4 x float> <float undef, float 1.000000e+00, float undef, float 1.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %vecins3.i
+}
+
+; Function Attrs: nounwind
+define <4 x float> @test_xxsplti32dx_6(<4 x float> %vfa) {
+; CHECK-LE-LABEL: test_xxsplti32dx_6:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 0, 1073741824
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_6:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, -1, 1073741824
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins3.i = shufflevector <4 x float> <float 2.000000e+00, float undef, float 2.000000e+00, float undef>, <4 x float> %vfa, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %vecins3.i
+}