diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -102,6 +102,10 @@
     /// vector or scalar.
     XXSPLTI_SP_TO_DP,
 
+    /// XXSPLTI32DX - The PPC XXSPLTI32DX instruction.
+    ///
+    XXSPLTI32DX,
+
     /// VECINSERT - The PPC vector insert instruction
     ///
     VECINSERT,
@@ -1270,6 +1274,10 @@
     /// essentially v16i8 vector version of VINSERTH.
     SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
 
+    /// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+    /// handled by the XXSPLTI32DX instruction introduced in ISA 3.1.
+    SDValue lowerToXXSPLTI32DX(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
     // Return whether the call instruction can potentially be optimized to a
     // tail call. This will cause the optimizers to attempt to move, or
     // duplicate return instructions to help enable tail call optimizations.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1477,6 +1477,8 @@
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
   case PPCISD::XXSPLTI_SP_TO_DP:
     return "PPCISD::XXSPLTI_SP_TO_DP";
+  case PPCISD::XXSPLTI32DX:
+    return "PPCISD::XXSPLTI32DX";
   case PPCISD::VECINSERT:       return "PPCISD::VECINSERT";
   case PPCISD::XXPERMDI:        return "PPCISD::XXPERMDI";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
@@ -9778,6 +9780,85 @@
   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
 }
 
+/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
+/// return the default SDValue.
+SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
+                                              SelectionDAG &DAG) const {
+  // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
+  // to v16i8. Peek through the bitcasts to get the actual operands.
+  SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
+  SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
+
+  auto ShuffleMask = SVN->getMask();
+  SDLoc DL(SVN);
+
+  // Check that we have a four byte shuffle.
+  if (!isNByteElemShuffleMask(SVN, 4, 1))
+    return SDValue();
+
+  // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
+  if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
+    // If the LHS is not a BUILD_VECTOR either there is no constant to splat,
+    // so bail out before creating a commuted shuffle that would go unused.
+    if (LHS->getOpcode() != ISD::BUILD_VECTOR)
+      return SDValue();
+    std::swap(LHS, RHS);
+    // The commuted shuffle is not guaranteed to still be a VECTOR_SHUFFLE
+    // node, so check the node kind instead of asserting it with cast<>.
+    SDValue Commuted = DAG.getCommutedVectorShuffle(*SVN);
+    auto *CommutedSVN = dyn_cast<ShuffleVectorSDNode>(Commuted);
+    if (!CommutedSVN)
+      return SDValue();
+    ShuffleMask = CommutedSVN->getMask();
+  }
+
+  // Ensure that the RHS is a vector of constants.
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+  if (!BVN)
+    return SDValue();
+
+  // Check if RHS is a splat of 4-bytes (or smaller).
+  APInt APSplatValue, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
+                            HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
+      SplatBitSize > 32)
+    return SDValue();
+
+  // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
+  // The instruction splats a constant C into two words of the source vector
+  // producing { Unchanged, C, Unchanged, C } or { C, Unchanged, C, Unchanged }.
+  // Thus we check that the shuffle mask is the equivalent of
+  // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
+  // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
+  // within each word are consecutive, so we only need to check the first byte.
+  SDValue Index;
+  bool IsLE = Subtarget.isLittleEndian();
+  if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
+      (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
+       ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
+    Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
+  else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
+           (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
+            ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
+    Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
+  else
+    return SDValue();
+
+  // If the splat is narrower than 32-bits, we need to get the 32-bit value
+  // for XXSPLTI32DX.
+  unsigned SplatVal = APSplatValue.getZExtValue();
+  for (; SplatBitSize < 32; SplatBitSize <<= 1)
+    SplatVal |= (SplatVal << SplatBitSize);
+
+  SDValue SplatNode = DAG.getNode(
+      PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
+      Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
+}
+
 /// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
 /// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
 /// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
@@ -9895,6 +9976,12 @@
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
   }
 
+  if (Subtarget.hasPrefixInstrs()) {
+    SDValue SplatInsertNode;
+    if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
+      return SplatInsertNode;
+  }
+
   if (Subtarget.hasP9Altivec()) {
     SDValue NewISDNode;
     if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1,3 +1,19 @@
+//===----------------------------------------------------------------------===//
+// PowerPC ISA 3.1 specific type constraints.
+//
+
+def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
+  SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ISA 3.1 specific PPCISD nodes.
+//
+
+def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+
+//===----------------------------------------------------------------------===//
+
 // PC Relative flag (for instructions that use the address of the prefix for
 // address computations).
 class isPCRel { bit PCRel = 1; }
@@ -732,8 +748,11 @@
                                                   (PPCxxspltidp i32:$IMM32))]>;
   def XXSPLTI32DX :
       8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
-                             (ins vsrc:$XTi, i1imm:$IX, i32imm:$IMM32),
-                             "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral, []>,
+                             (ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32),
+                             "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral,
+                             [(set v2i64:$XT,
+                                   (PPCxxsplti32dx v2i64:$XTi, i32:$IX,
+                                                   i32:$IMM32))]>,
                              RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
   def XXPERMX :
     8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
+; RUN: FileCheck --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
+; RUN: FileCheck --check-prefix=CHECK-BE %s
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_1(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_1:
+; CHECK-LE: # %bb.0: # %entry
+; CHECK-LE-NEXT: xxsplti32dx vs34, 0, 566
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_1:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsplti32dx vs34, 1, 566
+; CHECK-BE-NEXT: blr
+entry:
+  %vecins1 = shufflevector <4 x i32> %a, <4 x i32> , <4 x i32> ; NOTE(review): the vector constant and shuffle mask operands look stripped here and in every shufflevector below - restore from the original test before use
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_2(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_2:
+; CHECK-LE: # %bb.0: # %entry
+; CHECK-LE-NEXT: xxsplti32dx vs34, 1, 33
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsplti32dx vs34, 0, 33
+; CHECK-BE-NEXT: blr
+entry:
+  %vecins1 = shufflevector <4 x i32> , <4 x i32> %a, <4 x i32>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_3(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_3:
+; CHECK-LE: # %bb.0: # %entry
+; CHECK-LE-NEXT: xxsplti32dx vs34, 0, 12
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_3:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsplti32dx vs34, 1, 12
+; CHECK-BE-NEXT: blr
+entry:
+  %vecins1 = shufflevector <4 x i32> %a, <4 x i32> , <4 x i32>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @test_xxsplti32dx_4(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_4:
+; CHECK-LE: # %bb.0: # %entry
+; CHECK-LE-NEXT: xxsplti32dx vs34, 1, -683
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_4:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsplti32dx vs34, 0, -683
+; CHECK-BE-NEXT: blr
+entry:
+  %vecins1 = shufflevector <4 x i32> , <4 x i32> %a, <4 x i32>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: nounwind
+define <4 x float> @test_xxsplti32dx_5(<4 x float> %vfa) {
+; CHECK-LE-LABEL: test_xxsplti32dx_5:
+; CHECK-LE: # %bb.0: # %entry
+; CHECK-LE-NEXT: xxsplti32dx vs34, 0, 1065353216
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_5:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsplti32dx vs34, 1, 1065353216
+; CHECK-BE-NEXT: blr
+entry:
+  %vecins3.i = shufflevector <4 x float> %vfa, <4 x float> , <4 x i32>
+  ret <4 x float> %vecins3.i
+}
+
+; Function Attrs: nounwind
+define <4 x float> @test_xxsplti32dx_6(<4 x float> %vfa) {
+; CHECK-LE-LABEL: test_xxsplti32dx_6:
+; CHECK-LE: # %bb.0: # %entry
+; CHECK-LE-NEXT: xxsplti32dx vs34, 1, 1073741824
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_6:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsplti32dx vs34, 0, 1073741824
+; CHECK-BE-NEXT: blr
+entry:
+  %vecins3.i = shufflevector <4 x float> , <4 x float> %vfa, <4 x i32>
+  ret <4 x float> %vecins3.i
+}
+
+; Function Attrs: norecurse nounwind readnone
+; Test a splat narrower than 32 bits; it must be widened to the 32-bit immediate.
+define dso_local <4 x i32> @test_xxsplti32dx_7(<4 x i32> %a) local_unnamed_addr #0 {
+; CHECK-LE-LABEL: test_xxsplti32dx_7:
+; CHECK-LE: # %bb.0: # %entry
+; CHECK-LE-NEXT: xxsplti32dx vs34, 1, -1414812757
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_7:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsplti32dx vs34, 0, -1414812757
+; CHECK-BE-NEXT: blr
+entry:
+  %vecins1 = shufflevector <4 x i32> , <4 x i32> %a, <4 x i32>
+  ret <4 x i32> %vecins1
+}