diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -554,6 +554,14 @@
     /// instructions such as LXVDSX, LXVWSX.
     LD_SPLAT,
 
+    /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+    /// instruction that zero-extends.
+    ZEXT_LD_SPLAT,
+
+    /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+    /// instruction that sign-extends.
+    SEXT_LD_SPLAT,
+
     /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
     /// Maps directly to an stxvd2x instruction that will be preceded by
     /// an xxswapd.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1707,6 +1707,8 @@
   case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
   case PPCISD::XXMFACC:         return "PPCISD::XXMFACC";
   case PPCISD::LD_SPLAT:        return "PPCISD::LD_SPLAT";
+  case PPCISD::ZEXT_LD_SPLAT:   return "PPCISD::ZEXT_LD_SPLAT";
+  case PPCISD::SEXT_LD_SPLAT:   return "PPCISD::SEXT_LD_SPLAT";
   case PPCISD::FNMSUB:          return "PPCISD::FNMSUB";
   case PPCISD::STRICT_FADDRTZ:  return "PPCISD::STRICT_FADDRTZ";
@@ -9066,6 +9068,34 @@
   return (!LosesInfo && !APFloatToConvert.isDenormal());
 }
 
+static bool IsValidSplatLoad(const PPCSubtarget &Subtarget, SDValue &Op,
+                             unsigned &Opcode) {
+  const SDNode *InputNode = Op.getOperand(0).getNode();
+  if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
+    return false;
+
+  if (!Subtarget.hasVSX())
+    return false;
+
+  EVT Ty = Op->getValueType(0);
+  if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
+      Ty == MVT::v8i16 || Ty == MVT::v16i8)
+    return true;
+
+  if (Ty == MVT::v2i64) {
+    // Check the extend type when the input type is i32 and the output
+    // vector type is v2i64.
+    if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+      if (ISD::isZEXTLoad(InputNode))
+        Opcode = PPCISD::ZEXT_LD_SPLAT;
+      if (ISD::isSEXTLoad(InputNode))
+        Opcode = PPCISD::SEXT_LD_SPLAT;
+    }
+    return true;
+  }
+  return false;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it. If we CAN select this case, and if it
 // selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -9129,17 +9159,17 @@
   }
 
   if (!BVNIsConstantSplat || SplatBitSize > 32) {
+    unsigned NewOpcode = PPCISD::LD_SPLAT;
-    bool IsPermutedLoad = false;
-    const SDValue *InputLoad =
-        getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
     // Handle load-and-splat patterns as we have instructions that will do this
     // in one go.
-    if (InputLoad && DAG.isSplatValue(Op, true)) {
+    if (DAG.isSplatValue(Op, true) &&
+        IsValidSplatLoad(Subtarget, Op, NewOpcode)) {
+      const SDValue *InputLoad = &Op.getOperand(0);
       LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
-      // We have handling for 4 and 8 byte elements.
-      unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+      unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits() *
+                             ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
 
       // Checking for a single use of this load, we have to check for vector
       // width (128 bits) / ElementSize uses (since each operand of the
@@ -9148,17 +9178,50 @@
       for (SDValue BVInOp : Op->ops())
         if (BVInOp.isUndef())
           NumUsesOfInputLD--;
+
+      // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
+      // case 1 - lfiwzx/lfiwax
+      // 1.1: load result is i32 and is sign/zero-extended to i64;
+      // 1.2: build a v2i64 vector type with above loaded value;
+      // 1.3: the vector has only one value at index 0, others are all undef;
+      // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
+      //
+      // case 2 - lxvrhx
+      // 2.1: load result is i16;
+      // 2.2: build a v8i16 vector with above loaded value;
+      // 2.3: the vector has only one value at index 0, others are all undef;
+      // 2.4: on LE target, so that lxvrhx does not need any permute.
+      //
+      // case 3 - lxvrbx
+      // 3.1: load result is i8;
+      // 3.2: build a v16i8 vector with above loaded value;
+      // 3.3: the vector has only one value at index 0, others are all undef;
+      // 3.4: on LE target, so that lxvrbx does not need any permute.
+      //
+      // The same issue would also arise for "lfiwzx/lfiwax + LE target +
+      // index 1", "lxvrhx + BE target + index 7" and "lxvrbx + BE target +
+      // index 15", but function IsValidSplatLoad() only returns true when
+      // element 0 of the build vector is the load, so we will not get into
+      // trouble for these cases.
+      if (NumUsesOfInputLD == 1 &&
+          ((Op->getValueType(0) == MVT::v2i64 &&
+            NewOpcode != PPCISD::LD_SPLAT && !Subtarget.isLittleEndian() &&
+            Subtarget.hasVSX() && Subtarget.hasLFIWAX()) ||
+           (Subtarget.isLittleEndian() && Subtarget.isISA3_1() &&
+            (Op->getValueType(0) == MVT::v8i16 ||
+             Op->getValueType(0) == MVT::v16i8))))
+        return SDValue();
+
       assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
       if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
-          ((Subtarget.hasVSX() && ElementSize == 64) ||
-           (Subtarget.hasP9Vector() && ElementSize == 32))) {
+          Subtarget.hasVSX()) {
         SDValue Ops[] = {
           LD->getChain(),    // Chain
           LD->getBasePtr(),  // Ptr
           DAG.getValueType(Op.getValueType())  // VT
         };
         SDValue LdSplt = DAG.getMemIntrinsicNode(
-            PPCISD::LD_SPLAT, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
+            NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
             Ops, LD->getMemoryVT(), LD->getMemOperand());
         // Replace all uses of the output chain of the original load with the
         // output chain of the new load.
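
For reviewers, a minimal IR sketch of the extending load-and-splat pattern the new ZEXT_LD_SPLAT/SEXT_LD_SPLAT nodes are meant to catch (the function name is illustrative; test5/test6 in load-and-splat.ll below exercise the same pattern): an i32 load, zero- or sign-extended to i64 and splatted into both v2i64 lanes. With the patterns added in PPCInstrVSX.td this selects to lfiwzx (zext) or lfiwax (sext) plus a doubleword splat, instead of a scalar load that round-trips through the stack.

  ; Illustrative zero-extending case; swap zext for sext to get lfiwax.
  define <2 x i64> @zext_load_splat(i32* %p) {
  entry:
    %v = load i32, i32* %p, align 4
    %ext = zext i32 %v to i64
    %ins = insertelement <2 x i64> undef, i64 %ext, i32 0
    %splat = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer
    ret <2 x i64> %splat
  }
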
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -138,6 +138,10 @@ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED", SDTypeProfile<1, 1, []>, []>; @@ -2823,10 +2827,23 @@ def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; + def : Pat<(v2f64 (PPCldsplat ForceXForm:$A)), (v2f64 (LXVDSX ForceXForm:$A))>; +def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)), + (v4f32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>; def : Pat<(v2i64 (PPCldsplat ForceXForm:$A)), (v2i64 (LXVDSX ForceXForm:$A))>; +def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)), + (v4i32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>; +def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)), + (v8i16 (VSPLTHs 3, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LHZX ForceXForm:$A), sub_32))))>; +def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)), + (v16i8 (VSPLTBs 7, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LBZX ForceXForm:$A), sub_32))))>; +def : Pat<(v2i64 (PPCzextldsplat ForceXForm:$A)), + (v2i64 (XXPERMDIs (LFIWZX ForceXForm:$A), 0))>; +def : Pat<(v2i64 (PPCsextldsplat ForceXForm:$A)), + (v2i64 (XXPERMDIs (LFIWAX ForceXForm:$A), 0))>; // Build vectors of floating point converted to i64. def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)), @@ -4078,6 +4095,10 @@ (v4f32 (LXVWSX ForceXForm:$A))>; def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)), (v4i32 (LXVWSX ForceXForm:$A))>; +def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)), + (v8i16 (VSPLTHs 3, (LXSIHZX ForceXForm:$A)))>; +def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)), + (v16i8 (VSPLTBs 7, (LXSIBZX ForceXForm:$A)))>; } // HasVSX, HasP9Vector // Any Power9 VSX subtarget with equivalent length but better Power10 VSX diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -603,14 +603,23 @@ ToErase = &MI; Simplified = true; } - } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs && + } else if ((Immed == 0 || Immed == 3 || Immed == 2) && DefOpc == PPC::XXPERMDIs && (DefMI->getOperand(2).getImm() == 0 || DefMI->getOperand(2).getImm() == 3)) { + ToErase = &MI; + Simplified = true; + // Swap of a splat, convert to copy. + if (Immed == 2) { + LLVM_DEBUG(dbgs() << "Optimizing swap(splat) => copy(splat): "); + LLVM_DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); + break; + } // Splat fed by another splat - switch the output of the first // and remove the second. 
DefMI->getOperand(0).setReg(MI.getOperand(0).getReg()); - ToErase = &MI; - Simplified = true; LLVM_DEBUG(dbgs() << "Removing redundant splat: "); LLVM_DEBUG(MI.dump()); } diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -640,24 +640,20 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(i32* nocapture readonly %ptr, i32 signext %offset) local_unnamed_addr #0 { ; CHECK-P8-LABEL: no_RAUW_in_combine_during_legalize: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: addis r5, r2, .LCPI16_0@toc@ha ; CHECK-P8-NEXT: sldi r4, r4, 2 -; CHECK-P8-NEXT: xxlxor v4, v4, v4 -; CHECK-P8-NEXT: addi r5, r5, .LCPI16_0@toc@l -; CHECK-P8-NEXT: lxsiwzx v2, r3, r4 -; CHECK-P8-NEXT: lvx v3, 0, r5 -; CHECK-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-P8-NEXT: lfiwzx f0, r3, r4 +; CHECK-P8-NEXT: xxspltd v2, f0, 0 +; CHECK-P8-NEXT: vmrglb v2, v3, v2 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: no_RAUW_in_combine_during_legalize: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: sldi r4, r4, 2 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 -; CHECK-P9-NEXT: lxsiwzx v2, r3, r4 -; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l -; CHECK-P9-NEXT: lxv v3, 0(r3) -; CHECK-P9-NEXT: vperm v2, v4, v2, v3 +; CHECK-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-P9-NEXT: lfiwzx f0, r3, r4 +; CHECK-P9-NEXT: xxspltd v2, f0, 0 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: no_RAUW_in_combine_during_legalize: @@ -682,12 +678,9 @@ ; CHECK-P7-LABEL: no_RAUW_in_combine_during_legalize: ; CHECK-P7: # %bb.0: # %entry ; CHECK-P7-NEXT: sldi r4, r4, 2 -; CHECK-P7-NEXT: addi r5, r1, -16 ; CHECK-P7-NEXT: xxlxor v3, v3, v3 -; CHECK-P7-NEXT: lwzx r3, r3, r4 -; CHECK-P7-NEXT: std r3, -16(r1) -; CHECK-P7-NEXT: lxvd2x vs0, 0, r5 -; CHECK-P7-NEXT: xxswapd v2, vs0 +; CHECK-P7-NEXT: lfiwzx f0, r3, r4 +; CHECK-P7-NEXT: xxspltd v2, f0, 0 ; CHECK-P7-NEXT: vmrglb v2, v3, v2 ; CHECK-P7-NEXT: blr entry: @@ -831,7 +824,7 @@ define dso_local void @testByteSplat() #0 { ; CHECK-P8-LABEL: testByteSplat: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lbz r3, 0(r3) +; CHECK-P8-NEXT: lbzx r3, 0, r3 ; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: vspltb v2, v2, 7 ; CHECK-P8-NEXT: stvx v2, 0, r3 @@ -863,11 +856,9 @@ ; ; CHECK-P7-LABEL: testByteSplat: ; CHECK-P7: # %bb.0: # %entry -; CHECK-P7-NEXT: lbz r3, 0(r3) -; CHECK-P7-NEXT: stb r3, -16(r1) -; CHECK-P7-NEXT: addi r3, r1, -16 -; CHECK-P7-NEXT: lvx v2, 0, r3 -; CHECK-P7-NEXT: vspltb v2, v2, 15 +; CHECK-P7-NEXT: lbzx r3, 0, r3 +; CHECK-P7-NEXT: mtvsrd v2, r3 +; CHECK-P7-NEXT: vspltb v2, v2, 7 ; CHECK-P7-NEXT: stvx v2, 0, r3 ; CHECK-P7-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -9,6 +9,7 @@ ; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \ ; RUN: -check-prefix=P7 +; v2f64 define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readonly %a) local_unnamed_addr { ; P9-LABEL: test: ; P9: # %bb.0: # %entry @@ -39,6 +40,7 @@ ret void } +; v4f32 define dso_local void @test2(<4 x float>* nocapture %c, float* nocapture readonly %a) local_unnamed_addr { ; P9-LABEL: test2: ; P9: # %bb.0: # %entry @@ -57,11 +59,9 @@ ; ; P7-LABEL: test2: 
; P7: # %bb.0: # %entry -; P7-NEXT: lwz r4, 12(r4) -; P7-NEXT: addi r5, r1, -16 -; P7-NEXT: stw r4, -16(r1) -; P7-NEXT: lxvw4x vs0, 0, r5 -; P7-NEXT: xxspltw vs0, vs0, 0 +; P7-NEXT: addi r4, r4, 12 +; P7-NEXT: lfiwzx f0, 0, r4 +; P7-NEXT: xxspltw vs0, vs0, 1 ; P7-NEXT: stxvw4x vs0, 0, r3 ; P7-NEXT: blr entry: @@ -73,6 +73,7 @@ ret void } +; v4i32 define dso_local void @test3(<4 x i32>* nocapture %c, i32* nocapture readonly %a) local_unnamed_addr { ; P9-LABEL: test3: ; P9: # %bb.0: # %entry @@ -91,11 +92,9 @@ ; ; P7-LABEL: test3: ; P7: # %bb.0: # %entry -; P7-NEXT: lwz r4, 12(r4) -; P7-NEXT: addi r5, r1, -16 -; P7-NEXT: stw r4, -16(r1) -; P7-NEXT: lxvw4x vs0, 0, r5 -; P7-NEXT: xxspltw vs0, vs0, 0 +; P7-NEXT: addi r4, r4, 12 +; P7-NEXT: lfiwzx f0, 0, r4 +; P7-NEXT: xxspltw vs0, vs0, 1 ; P7-NEXT: stxvw4x vs0, 0, r3 ; P7-NEXT: blr entry: @@ -107,6 +106,8 @@ ret void } + +; v2i64 define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a) local_unnamed_addr { ; P9-LABEL: test4: ; P9: # %bb.0: # %entry @@ -137,28 +138,26 @@ ret void } +; sext v2i64 define void @test5(<2 x i64>* %a, i32* %in) { ; P9-LABEL: test5: ; P9: # %bb.0: # %entry ; P9-NEXT: lfiwax f0, 0, r4 -; P9-NEXT: xxspltd vs0, vs0, 0 +; P9-NEXT: xxspltd vs0, f0, 0 ; P9-NEXT: stxv vs0, 0(r3) ; P9-NEXT: blr ; ; P8-LABEL: test5: ; P8: # %bb.0: # %entry ; P8-NEXT: lfiwax f0, 0, r4 -; P8-NEXT: xxspltd vs0, vs0, 0 +; P8-NEXT: xxspltd vs0, f0, 0 ; P8-NEXT: stxvd2x vs0, 0, r3 ; P8-NEXT: blr ; ; P7-LABEL: test5: ; P7: # %bb.0: # %entry -; P7-NEXT: lwa r4, 0(r4) -; P7-NEXT: addi r5, r1, -16 -; P7-NEXT: std r4, -8(r1) -; P7-NEXT: std r4, -16(r1) -; P7-NEXT: lxvd2x vs0, 0, r5 +; P7-NEXT: lfiwax f0, 0, r4 +; P7-NEXT: xxspltd vs0, f0, 0 ; P7-NEXT: stxvd2x vs0, 0, r3 ; P7-NEXT: blr entry: @@ -170,28 +169,26 @@ ret void } +; zext v2i64 define void @test6(<2 x i64>* %a, i32* %in) { ; P9-LABEL: test6: ; P9: # %bb.0: # %entry ; P9-NEXT: lfiwzx f0, 0, r4 -; P9-NEXT: xxspltd vs0, vs0, 0 +; P9-NEXT: xxspltd vs0, f0, 0 ; P9-NEXT: stxv vs0, 0(r3) ; P9-NEXT: blr ; ; P8-LABEL: test6: ; P8: # %bb.0: # %entry ; P8-NEXT: lfiwzx f0, 0, r4 -; P8-NEXT: xxspltd vs0, vs0, 0 +; P8-NEXT: xxspltd vs0, f0, 0 ; P8-NEXT: stxvd2x vs0, 0, r3 ; P8-NEXT: blr ; ; P7-LABEL: test6: ; P7: # %bb.0: # %entry -; P7-NEXT: lwz r4, 0(r4) -; P7-NEXT: addi r5, r1, -16 -; P7-NEXT: std r4, -8(r1) -; P7-NEXT: std r4, -16(r1) -; P7-NEXT: lxvd2x vs0, 0, r5 +; P7-NEXT: lfiwzx f0, 0, r4 +; P7-NEXT: xxspltd vs0, f0, 0 ; P7-NEXT: stxvd2x vs0, 0, r3 ; P7-NEXT: blr entry: @@ -203,6 +200,70 @@ ret void } +; v8i16 +define void @test7(<8 x i16>* %a, i16* %in) { +; P9-LABEL: test7: +; P9: # %bb.0: # %entry +; P9-NEXT: lxsihzx v2, 0, r4 +; P9-NEXT: vsplth v2, v2, 3 +; P9-NEXT: stxv v2, 0(r3) +; P9-NEXT: blr +; +; P8-LABEL: test7: +; P8: # %bb.0: # %entry +; P8-NEXT: lhzx r4, 0, r4 +; P8-NEXT: mtvsrd v2, r4 +; P8-NEXT: vsplth v2, v2, 3 +; P8-NEXT: stvx v2, 0, r3 +; P8-NEXT: blr +; +; P7-LABEL: test7: +; P7: # %bb.0: # %entry +; P7-NEXT: lhzx r4, 0, r4 +; P7-NEXT: mtvsrd v2, r4 +; P7-NEXT: vsplth v2, v2, 3 +; P7-NEXT: stxvw4x v2, 0, r3 +; P7-NEXT: blr +entry: + %0 = load i16, i16* %in, align 2 + %splat.splatinsert.i = insertelement <8 x i16> poison, i16 %0, i32 0 + %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> poison, <8 x i32> zeroinitializer + store <8 x i16> %splat.splat.i, <8 x i16>* %a, align 16 + ret void +} + +; v16i8 +define void @test8(<16 x i8>* %a, i8* %in) { +; P9-LABEL: test8: +; P9: # %bb.0: # %entry +; P9-NEXT: lxsibzx v2, 0, r4 +; P9-NEXT: 
vspltb v2, v2, 7 +; P9-NEXT: stxv v2, 0(r3) +; P9-NEXT: blr +; +; P8-LABEL: test8: +; P8: # %bb.0: # %entry +; P8-NEXT: lbzx r4, 0, r4 +; P8-NEXT: mtvsrd v2, r4 +; P8-NEXT: vspltb v2, v2, 7 +; P8-NEXT: stvx v2, 0, r3 +; P8-NEXT: blr +; +; P7-LABEL: test8: +; P7: # %bb.0: # %entry +; P7-NEXT: lbzx r4, 0, r4 +; P7-NEXT: mtvsrd v2, r4 +; P7-NEXT: vspltb v2, v2, 7 +; P7-NEXT: stxvw4x v2, 0, r3 +; P7-NEXT: blr +entry: + %0 = load i8, i8* %in, align 1 + %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %0, i32 0 + %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer + store <16 x i8> %splat.splat.i, <16 x i8>* %a, align 16 + ret void +} + define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) { ; P9-LABEL: unadjusted_lxvwsx: ; P9: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9LE ; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ @@ -14,24 +15,27 @@ ; P9LE-NEXT: lfiwax f0, 0, r3 ; P9LE-NEXT: xxmrghd v2, v2, vs0 ; P9LE-NEXT: blr - +; ; P9BE-LABEL: s2v_test1: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfiwax f0, 0, r3 ; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P9BE-NEXT: blr - +; ; P8LE-LABEL: s2v_test1: ; P8LE: # %bb.0: # %entry ; P8LE-NEXT: lfiwax f0, 0, r3 ; P8LE-NEXT: xxmrghd v2, v2, vs0 ; P8LE-NEXT: blr - +; ; P8BE-LABEL: s2v_test1: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: lfiwax f0, 0, r3 ; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P8BE-NEXT: blr + + + entry: %0 = load i32, i32* %int32, align 4 %conv = sext i32 %0 to i64 @@ -47,27 +51,30 @@ ; P9LE-NEXT: lfiwax f0, 0, r3 ; P9LE-NEXT: xxmrghd v2, v2, vs0 ; P9LE-NEXT: blr - +; ; P9BE-LABEL: s2v_test2: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: addi r3, r3, 4 ; P9BE-NEXT: lfiwax f0, 0, r3 ; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P9BE-NEXT: blr - +; ; P8LE-LABEL: s2v_test2: ; P8LE: # %bb.0: # %entry ; P8LE-NEXT: addi r3, r3, 4 ; P8LE-NEXT: lfiwax f0, 0, r3 ; P8LE-NEXT: xxmrghd v2, v2, vs0 ; P8LE-NEXT: blr - +; ; P8BE-LABEL: s2v_test2: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: addi r3, r3, 4 ; P8BE-NEXT: lfiwax f0, 0, r3 ; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P8BE-NEXT: blr + + + entry: %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 %0 = load i32, i32* %arrayidx, align 4 @@ -84,27 +91,30 @@ ; P9LE-NEXT: lfiwax f0, r3, r4 ; P9LE-NEXT: xxmrghd v2, v2, vs0 ; P9LE-NEXT: blr - +; ; P9BE-LABEL: s2v_test3: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: sldi r4, r7, 2 ; P9BE-NEXT: lfiwax f0, r3, r4 ; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P9BE-NEXT: blr - +; ; P8LE-LABEL: s2v_test3: ; P8LE: # %bb.0: # %entry ; P8LE-NEXT: sldi r4, r7, 2 ; P8LE-NEXT: lfiwax f0, r3, r4 ; P8LE-NEXT: xxmrghd v2, v2, vs0 ; P8LE-NEXT: blr - +; ; P8BE-LABEL: s2v_test3: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: sldi r4, r7, 2 ; P8BE-NEXT: lfiwax f0, r3, r4 ; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P8BE-NEXT: blr + + + entry: %idxprom = sext i32 %Idx to i64 %arrayidx = getelementptr inbounds i32, i32* %int32, i64 %idxprom @@ -122,27 +132,30 @@ ; P9LE-NEXT: lfiwax f0, 0, r3 ; P9LE-NEXT: xxmrghd v2, v2, vs0 ; P9LE-NEXT: blr - +; ; P9BE-LABEL: s2v_test4: ; 
P9BE: # %bb.0: # %entry ; P9BE-NEXT: addi r3, r3, 4 ; P9BE-NEXT: lfiwax f0, 0, r3 ; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P9BE-NEXT: blr - +; ; P8LE-LABEL: s2v_test4: ; P8LE: # %bb.0: # %entry ; P8LE-NEXT: addi r3, r3, 4 ; P8LE-NEXT: lfiwax f0, 0, r3 ; P8LE-NEXT: xxmrghd v2, v2, vs0 ; P8LE-NEXT: blr - +; ; P8BE-LABEL: s2v_test4: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: addi r3, r3, 4 ; P8BE-NEXT: lfiwax f0, 0, r3 ; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P8BE-NEXT: blr + + + entry: %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 %0 = load i32, i32* %arrayidx, align 4 @@ -158,24 +171,27 @@ ; P9LE-NEXT: lfiwax f0, 0, r5 ; P9LE-NEXT: xxmrghd v2, v2, vs0 ; P9LE-NEXT: blr - +; ; P9BE-LABEL: s2v_test5: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfiwax f0, 0, r5 ; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P9BE-NEXT: blr - +; ; P8LE-LABEL: s2v_test5: ; P8LE: # %bb.0: # %entry ; P8LE-NEXT: lfiwax f0, 0, r5 ; P8LE-NEXT: xxmrghd v2, v2, vs0 ; P8LE-NEXT: blr - +; ; P8BE-LABEL: s2v_test5: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: lfiwax f0, 0, r5 ; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 ; P8BE-NEXT: blr + + + entry: %0 = load i32, i32* %ptr1, align 4 %conv = sext i32 %0 to i64 @@ -188,26 +204,29 @@ ; P9LE-LABEL: s2v_test6: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: lfiwax f0, 0, r3 -; P9LE-NEXT: xxspltd v2, vs0, 0 +; P9LE-NEXT: xxspltd v2, f0, 0 ; P9LE-NEXT: blr - +; ; P9BE-LABEL: s2v_test6: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfiwax f0, 0, r3 -; P9BE-NEXT: xxspltd v2, vs0, 0 +; P9BE-NEXT: xxspltd v2, f0, 0 ; P9BE-NEXT: blr - +; ; P8LE-LABEL: s2v_test6: ; P8LE: # %bb.0: # %entry ; P8LE-NEXT: lfiwax f0, 0, r3 -; P8LE-NEXT: xxspltd v2, vs0, 0 +; P8LE-NEXT: xxspltd v2, f0, 0 ; P8LE-NEXT: blr - +; ; P8BE-LABEL: s2v_test6: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: lfiwax f0, 0, r3 -; P8BE-NEXT: xxspltd v2, vs0, 0 +; P8BE-NEXT: xxspltd v2, f0, 0 ; P8BE-NEXT: blr + + + entry: %0 = load i32, i32* %ptr, align 4 %conv = sext i32 %0 to i64 @@ -221,26 +240,29 @@ ; P9LE-LABEL: s2v_test7: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: lfiwax f0, 0, r3 -; P9LE-NEXT: xxspltd v2, vs0, 0 +; P9LE-NEXT: xxspltd v2, f0, 0 ; P9LE-NEXT: blr - +; ; P9BE-LABEL: s2v_test7: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfiwax f0, 0, r3 -; P9BE-NEXT: xxspltd v2, vs0, 0 +; P9BE-NEXT: xxspltd v2, f0, 0 ; P9BE-NEXT: blr - +; ; P8LE-LABEL: s2v_test7: ; P8LE: # %bb.0: # %entry ; P8LE-NEXT: lfiwax f0, 0, r3 -; P8LE-NEXT: xxspltd v2, vs0, 0 +; P8LE-NEXT: xxspltd v2, f0, 0 ; P8LE-NEXT: blr - +; ; P8BE-LABEL: s2v_test7: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: lfiwax f0, 0, r3 -; P8BE-NEXT: xxspltd v2, vs0, 0 +; P8BE-NEXT: xxspltd v2, f0, 0 ; P8BE-NEXT: blr + + + entry: %0 = load i32, i32* %ptr, align 4 %conv = sext i32 %0 to i64