diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5825,6 +5825,68 @@
       return;
     }
   }
+  case PPCISD::LD_SPLAT: {
+    // For v16i8 and v8i16, if the target has no direct move, we can still
+    // handle this without using the stack.
+    if (Subtarget->hasAltivec() && !Subtarget->hasDirectMove()) {
+      SDValue ZeroReg =
+          CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                              Subtarget->isPPC64() ? MVT::i64 : MVT::i32);
+      unsigned LIOpcode = Subtarget->isPPC64() ? PPC::LI8 : PPC::LI;
+      EVT Type = N->getValueType(0);
+      if (Type == MVT::v16i8 || Type == MVT::v8i16) {
+        // v16i8 LD_SPLAT addr
+        // ======>
+        // Mask = LVSR/LVSL 0, addr
+        // LoadLow = LVX 0, addr
+        // Perm = VPERM LoadLow, LoadLow, Mask
+        // Splat = VSPLTB 15/0, Perm
+        //
+        // v8i16 LD_SPLAT addr
+        // ======>
+        // Mask = LVSR/LVSL 0, addr
+        // LoadLow = LVX 0, addr
+        // LoadHigh = LVX (LI, 1), addr
+        // Perm = VPERM LoadLow, LoadHigh, Mask
+        // Splat = VSPLTH 7/0, Perm
+        unsigned SplatOp = (Type == MVT::v16i8) ? PPC::VSPLTB : PPC::VSPLTH;
+        unsigned SplatElemIndex =
+            Subtarget->isLittleEndian() ? ((Type == MVT::v16i8) ? 15 : 7) : 0;
+
+        SDNode *Mask = CurDAG->getMachineNode(
+            Subtarget->isLittleEndian() ? PPC::LVSR : PPC::LVSL, dl, Type,
+            ZeroReg, N->getOperand(1));
+
+        SDNode *LoadLow = CurDAG->getMachineNode(
+            PPC::LVX, dl, MVT::v16i8, MVT::Other,
+            {ZeroReg, N->getOperand(1), N->getOperand(0)});
+
+        SDNode *LoadHigh = LoadLow;
+        if (Type == MVT::v8i16) {
+          LoadHigh = CurDAG->getMachineNode(
+              PPC::LVX, dl, MVT::v16i8, MVT::Other,
+              {SDValue(CurDAG->getMachineNode(
+                           LIOpcode, dl, MVT::i32,
+                           CurDAG->getTargetConstant(1, dl, MVT::i8)),
+                       0),
+               N->getOperand(1), SDValue(LoadLow, 1)});
+        }
+
+        CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(LoadHigh, 1));
+        transferMemOperands(N, LoadHigh);
+
+        SDNode *Perm =
+            CurDAG->getMachineNode(PPC::VPERM, dl, Type, SDValue(LoadLow, 0),
+                                   SDValue(LoadHigh, 0), SDValue(Mask, 0));
+        CurDAG->SelectNodeTo(
+            N, SplatOp, Type,
+            CurDAG->getTargetConstant(SplatElemIndex, dl, MVT::i8),
+            SDValue(Perm, 0));
+        return;
+      }
+    }
+    break;
+  }
   }
 
   SelectCode(N);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -559,6 +559,14 @@
     /// instructions such as LXVDSX, LXVWSX.
     LD_SPLAT,
 
+    /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+    /// instruction that zero-extends.
+    ZEXT_LD_SPLAT,
+
+    /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+    /// instruction that sign-extends.
+    SEXT_LD_SPLAT,
+
     /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
     /// Maps directly to an stxvd2x instruction that will be preceded by
     /// an xxswapd.
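For reference, a minimal IR reproducer for the new LD_SPLAT expansion above; this is a sketch with an illustrative function name, assuming a subtarget such as pwr7 that has Altivec but no direct moves:

; Illustrative RUN configuration: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7
define <16 x i8> @splat_load_v16i8(i8* %p) {
entry:
  ; A scalar load feeding every lane of the build_vector, i.e. a splat load.
  %val = load i8, i8* %p, align 1
  %ins = insertelement <16 x i8> undef, i8 %val, i32 0
  %splat = shufflevector <16 x i8> %ins, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

With this change, the v16i8 case selects to lvsl (BE) or lvsr (LE), one lvx, a vperm, and a vspltb (the v8i16 case loads the following vector with a second lvx), matching the updated CHECK-P7 lines in the tests below, instead of bouncing the value through the stack.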
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1712,6 +1712,8 @@
   case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
   case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
   case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
+  case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT";
+  case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT";
   case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
   case PPCISD::STRICT_FADDRTZ:
     return "PPCISD::STRICT_FADDRTZ";
@@ -9060,6 +9062,34 @@
   return (!LosesInfo && !APFloatToConvert.isDenormal());
 }
 
+static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
+                             unsigned &Opcode) {
+  const SDNode *InputNode = Op.getOperand(0).getNode();
+  if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
+    return false;
+
+  if (!Subtarget.hasVSX())
+    return false;
+
+  EVT Ty = Op->getValueType(0);
+  if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
+      Ty == MVT::v8i16 || Ty == MVT::v16i8)
+    return true;
+
+  if (Ty == MVT::v2i64) {
+    // Check the extension type when the input type is i32 and the output
+    // vector type is v2i64.
+    if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+      if (ISD::isZEXTLoad(InputNode))
+        Opcode = PPCISD::ZEXT_LD_SPLAT;
+      if (ISD::isSEXTLoad(InputNode))
+        Opcode = PPCISD::SEXT_LD_SPLAT;
+    }
+    return true;
+  }
+  return false;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.  If we CAN select this case, and if it
 // selects to a single instruction, return Op.  Otherwise, if we can codegen
@@ -9123,17 +9153,17 @@
   if (!BVNIsConstantSplat || SplatBitSize > 32) {
+    unsigned NewOpcode = PPCISD::LD_SPLAT;
-    bool IsPermutedLoad = false;
-    const SDValue *InputLoad =
-        getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
     // Handle load-and-splat patterns as we have instructions that will do this
     // in one go.
-    if (InputLoad && DAG.isSplatValue(Op, true)) {
+    if (DAG.isSplatValue(Op, true) &&
+        isValidSplatLoad(Subtarget, Op, NewOpcode)) {
+      const SDValue *InputLoad = &Op.getOperand(0);
       LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
 
-      // We have handling for 4 and 8 byte elements.
-      unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+      unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits() *
+                             ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
 
       // Checking for a single use of this load, we have to check for vector
       // width (128 bits) / ElementSize uses (since each operand of the
@@ -9142,18 +9172,54 @@
       for (SDValue BVInOp : Op->ops())
         if (BVInOp.isUndef())
           NumUsesOfInputLD--;
+
+      // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
+      // the cases below should also happen for "lfiwzx/lfiwax + LE target +
+      // index 1", "lxvrhx + BE target + index 7" and "lxvrbx + BE target +
+      // index 15", but isValidSplatLoad() currently only returns true when
+      // the load feeds element 0 of the build_vector, so we will not get
+      // into trouble for these cases.
+      //
+      // case 1 - lfiwzx/lfiwax
+      //      1.1: load result is i32 and is sign/zero-extended to i64;
+      //      1.2: build a v2i64 vector type with above loaded value;
+      //      1.3: the vector has only one value at index 0, others are all undef;
+      //      1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
+      if (NumUsesOfInputLD == 1 &&
+          (Op->getValueType(0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
+           !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
+           Subtarget.hasLFIWAX()))
+        return SDValue();
+
+      // case 2 - lxvrhx
+      //      2.1: load result is i16;
+      //      2.2: build a v8i16 vector with above loaded value;
+      //      2.3: the vector has only one value at index 0, others are all undef;
+      //      2.4: on LE target, so that lxvrhx does not need any permute.
+      if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
+          Subtarget.isISA3_1() && Op->getValueType(0) == MVT::v8i16)
+        return SDValue();
+
+      // case 3 - lxvrbx
+      //      3.1: load result is i8;
+      //      3.2: build a v16i8 vector with above loaded value;
+      //      3.3: the vector has only one value at index 0, others are all undef;
+      //      3.4: on LE target, so that lxvrbx does not need any permute.
+      if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
+          Subtarget.isISA3_1() && Op->getValueType(0) == MVT::v16i8)
+        return SDValue();
+
       assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
       if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
-          ((Subtarget.hasVSX() && ElementSize == 64) ||
-           (Subtarget.hasP9Vector() && ElementSize == 32))) {
+          Subtarget.hasVSX()) {
         SDValue Ops[] = {
           LD->getChain(),    // Chain
           LD->getBasePtr(),  // Ptr
           DAG.getValueType(Op.getValueType()) // VT
         };
         SDValue LdSplt = DAG.getMemIntrinsicNode(
-            PPCISD::LD_SPLAT, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
-            Ops, LD->getMemoryVT(), LD->getMemOperand());
+            NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other), Ops,
+            LD->getMemoryVT(), LD->getMemOperand());
         // Replace all uses of the output chain of the original load with the
         // output chain of the new load.
         DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -138,6 +138,10 @@
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat,
+                            [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat,
+                            [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED",
                      SDTypeProfile<1, 1, []>, []>;
 
@@ -2827,10 +2831,20 @@
 def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
           (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+
+// Splat loads.
 def : Pat<(v2f64 (PPCldsplat ForceXForm:$A)),
           (v2f64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)),
+          (v4f32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
 def : Pat<(v2i64 (PPCldsplat ForceXForm:$A)),
           (v2i64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
+          (v4i32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
+def : Pat<(v2i64 (PPCzextldsplat ForceXForm:$A)),
+          (v2i64 (XXPERMDIs (LFIWZX ForceXForm:$A), 0))>;
+def : Pat<(v2i64 (PPCsextldsplat ForceXForm:$A)),
+          (v2i64 (XXPERMDIs (LFIWAX ForceXForm:$A), 0))>;
 
 // Build vectors of floating point converted to i64.
 def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -3540,6 +3554,14 @@
 def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
                                immSExt5NonZero:$A, immSExt5NonZero:$A)),
           (v4i32 (VSPLTISW imm:$A))>;
+
+// Splat loads.
+// Note that we use MTVSRD without checking PPC64 because we only care about
+// the lowest 16/8 bits.
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+          (v8i16 (VSPLTHs 3, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LHZX ForceXForm:$A), sub_32))))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+          (v16i8 (VSPLTBs 7, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LBZX ForceXForm:$A), sub_32))))>;
 } // HasVSX, HasDirectMove
 
 // Big endian VSX subtarget with direct moves.
@@ -4087,6 +4109,10 @@
           (v4f32 (LXVWSX ForceXForm:$A))>;
 def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
           (v4i32 (LXVWSX ForceXForm:$A))>;
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+          (v8i16 (VSPLTHs 3, (LXSIHZX ForceXForm:$A)))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+          (v16i8 (VSPLTBs 7, (LXSIBZX ForceXForm:$A)))>;
 } // HasVSX, HasP9Vector
 
 // Any Power9 VSX subtarget with equivalent length but better Power10 VSX
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -603,14 +603,24 @@
           ToErase = &MI;
           Simplified = true;
         }
-      } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs &&
+      } else if ((Immed == 0 || Immed == 3 || Immed == 2) &&
+                 DefOpc == PPC::XXPERMDIs &&
                  (DefMI->getOperand(2).getImm() == 0 ||
                   DefMI->getOperand(2).getImm() == 3)) {
+        ToErase = &MI;
+        Simplified = true;
+        // Swap of a splat, convert to copy.
+        if (Immed == 2) {
+          LLVM_DEBUG(dbgs() << "Optimizing swap(splat) => copy(splat): ");
+          LLVM_DEBUG(MI.dump());
+          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+                  MI.getOperand(0).getReg())
+              .add(MI.getOperand(1));
+          break;
+        }
         // Splat fed by another splat - switch the output of the first
         // and remove the second.
         DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
-        ToErase = &MI;
-        Simplified = true;
         LLVM_DEBUG(dbgs() << "Removing redundant splat: ");
         LLVM_DEBUG(MI.dump());
       }
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -640,24 +640,20 @@
 define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(i32* nocapture readonly %ptr, i32 signext %offset) local_unnamed_addr #0 {
 ; CHECK-P8-LABEL: no_RAUW_in_combine_during_legalize:
 ; CHECK-P8:       # %bb.0: # %entry
-; CHECK-P8-NEXT:    addis r5, r2, .LCPI16_0@toc@ha
 ; CHECK-P8-NEXT:    sldi r4, r4, 2
-; CHECK-P8-NEXT:    xxlxor v4, v4, v4
-; CHECK-P8-NEXT:    addi r5, r5, .LCPI16_0@toc@l
-; CHECK-P8-NEXT:    lxsiwzx v2, r3, r4
-; CHECK-P8-NEXT:    lvx v3, 0, r5
-; CHECK-P8-NEXT:    vperm v2, v4, v2, v3
+; CHECK-P8-NEXT:    xxlxor v3, v3, v3
+; CHECK-P8-NEXT:    lfiwzx f0, r3, r4
+; CHECK-P8-NEXT:    xxspltd v2, f0, 0
+; CHECK-P8-NEXT:    vmrglb v2, v3, v2
 ; CHECK-P8-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: no_RAUW_in_combine_during_legalize:
 ; CHECK-P9:       # %bb.0: # %entry
 ; CHECK-P9-NEXT:    sldi r4, r4, 2
-; CHECK-P9-NEXT:    xxlxor v4, v4, v4
-; CHECK-P9-NEXT:    lxsiwzx v2, r3, r4
-; CHECK-P9-NEXT:    addis r3, r2, .LCPI16_0@toc@ha
-; CHECK-P9-NEXT:    addi r3, r3, .LCPI16_0@toc@l
-; CHECK-P9-NEXT:    lxv v3, 0(r3)
-; CHECK-P9-NEXT:    vperm v2, v4, v2, v3
+; CHECK-P9-NEXT:    xxlxor v3, v3, v3
+; CHECK-P9-NEXT:    lfiwzx f0, r3, r4
+; CHECK-P9-NEXT:    xxspltd v2, f0, 0
+; CHECK-P9-NEXT:    vmrglb v2, v3, v2
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-P9-BE-LABEL: no_RAUW_in_combine_during_legalize:
@@ -682,12 +678,9 @@
 ; CHECK-P7-LABEL: no_RAUW_in_combine_during_legalize:
 ; CHECK-P7:       # %bb.0: # %entry
 ; CHECK-P7-NEXT:    sldi r4, r4, 2
-; CHECK-P7-NEXT:    addi r5, r1, -16
 ; CHECK-P7-NEXT:    xxlxor v3, v3, v3
-; CHECK-P7-NEXT:    lwzx r3, r3, r4
-; CHECK-P7-NEXT:    std r3, -16(r1)
-; CHECK-P7-NEXT:    lxvd2x vs0, 0, r5
-; CHECK-P7-NEXT:    xxswapd v2, vs0
+; CHECK-P7-NEXT:    lfiwzx f0, r3, r4
+; CHECK-P7-NEXT:    xxspltd v2, f0, 0
 ; CHECK-P7-NEXT:    vmrglb v2, v3, v2
 ; CHECK-P7-NEXT:    blr
 entry:
@@ -831,7 +824,7 @@
 define dso_local void @testByteSplat() #0 {
 ; CHECK-P8-LABEL: testByteSplat:
 ; CHECK-P8:       # %bb.0: # %entry
-; CHECK-P8-NEXT:    lbz r3, 0(r3)
+; CHECK-P8-NEXT:    lbzx r3, 0, r3
 ; CHECK-P8-NEXT:    mtvsrd v2, r3
 ; CHECK-P8-NEXT:    vspltb v2, v2, 7
 ; CHECK-P8-NEXT:    stvx v2, 0, r3
@@ -863,10 +856,9 @@
 ;
 ; CHECK-P7-LABEL: testByteSplat:
 ; CHECK-P7:       # %bb.0: # %entry
-; CHECK-P7-NEXT:    lbz r3, 0(r3)
-; CHECK-P7-NEXT:    stb r3, -16(r1)
-; CHECK-P7-NEXT:    addi r3, r1, -16
-; CHECK-P7-NEXT:    lvx v2, 0, r3
+; CHECK-P7-NEXT:    lvsr v2, 0, r3
+; CHECK-P7-NEXT:    lvx v3, 0, r3
+; CHECK-P7-NEXT:    vperm v2, v3, v3, v2
 ; CHECK-P7-NEXT:    vspltb v2, v2, 15
 ; CHECK-P7-NEXT:    stvx v2, 0, r3
 ; CHECK-P7-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -59,11 +59,9 @@
 ;
 ; P7-LABEL: test2:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lwz r4, 12(r4)
-; P7-NEXT:    addi r5, r1, -16
-; P7-NEXT:    stw r4, -16(r1)
-; P7-NEXT:    lxvw4x vs0, 0, r5
-; P7-NEXT:    xxspltw vs0, vs0, 0
+; P7-NEXT:    addi r4, r4, 12
+; P7-NEXT:    lfiwzx f0, 0, r4
+; P7-NEXT:    xxspltw vs0, vs0, 1
 ; P7-NEXT:    stxvw4x vs0, 0, r3
 ; P7-NEXT:    blr
 entry:
@@ -94,11 +92,9 @@
 ;
 ; P7-LABEL: test3:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lwz r4, 12(r4)
-; P7-NEXT:    addi r5, r1, -16
-; P7-NEXT:    stw r4, -16(r1)
-; P7-NEXT:    lxvw4x vs0, 0, r5
-; P7-NEXT:    xxspltw vs0, vs0, 0
+; P7-NEXT:    addi r4, r4, 12
+; P7-NEXT:    lfiwzx f0, 0, r4
+; P7-NEXT:    xxspltw vs0, vs0, 1
 ; P7-NEXT:    stxvw4x vs0, 0, r3
 ; P7-NEXT:    blr
 entry:
@@ -110,6 +106,7 @@
   ret void
 }
 
+
 ; v2i64
 define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a) local_unnamed_addr {
 ; P9-LABEL: test4:
@@ -146,24 +143,21 @@
 ; P9-LABEL: test5:
 ; P9:       # %bb.0: # %entry
 ; P9-NEXT:    lfiwax f0, 0, r4
-; P9-NEXT:    xxspltd vs0, vs0, 0
+; P9-NEXT:    xxspltd vs0, f0, 0
 ; P9-NEXT:    stxv vs0, 0(r3)
 ; P9-NEXT:    blr
 ;
 ; P8-LABEL: test5:
 ; P8:       # %bb.0: # %entry
 ; P8-NEXT:    lfiwax f0, 0, r4
-; P8-NEXT:    xxspltd vs0, vs0, 0
+; P8-NEXT:    xxspltd vs0, f0, 0
 ; P8-NEXT:    stxvd2x vs0, 0, r3
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test5:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lwa r4, 0(r4)
-; P7-NEXT:    addi r5, r1, -16
-; P7-NEXT:    std r4, -8(r1)
-; P7-NEXT:    std r4, -16(r1)
-; P7-NEXT:    lxvd2x vs0, 0, r5
+; P7-NEXT:    lfiwax f0, 0, r4
+; P7-NEXT:    xxspltd vs0, f0, 0
 ; P7-NEXT:    stxvd2x vs0, 0, r3
 ; P7-NEXT:    blr
 entry:
@@ -180,24 +174,21 @@
 ; P9-LABEL: test6:
 ; P9:       # %bb.0: # %entry
 ; P9-NEXT:    lfiwzx f0, 0, r4
-; P9-NEXT:    xxspltd vs0, vs0, 0
+; P9-NEXT:    xxspltd vs0, f0, 0
 ; P9-NEXT:    stxv vs0, 0(r3)
 ; P9-NEXT:    blr
 ;
 ; P8-LABEL: test6:
 ; P8:       # %bb.0: # %entry
 ; P8-NEXT:    lfiwzx f0, 0, r4
-; P8-NEXT:    xxspltd vs0, vs0, 0
+; P8-NEXT:    xxspltd vs0, f0, 0
 ; P8-NEXT:    stxvd2x vs0, 0, r3
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test6:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lwz r4, 0(r4)
-; P7-NEXT:    addi r5, r1, -16
-; P7-NEXT:    std r4, -8(r1)
-; P7-NEXT:    std r4, -16(r1)
-; P7-NEXT:    lxvd2x vs0, 0, r5
+; P7-NEXT:    lfiwzx f0, 0, r4
+; P7-NEXT:    xxspltd vs0, f0, 0
 ; P7-NEXT:    stxvd2x vs0, 0, r3
 ; P7-NEXT:    blr
 entry:
@@ -220,7 +211,7 @@
 ;
 ; P8-LABEL: test7:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lhz r4, 0(r4)
+; P8-NEXT:    lhzx r4, 0, r4
 ; P8-NEXT:    mtvsrd v2, r4
 ; P8-NEXT:    vsplth v2, v2, 3
 ; P8-NEXT:    stvx v2, 0, r3
@@ -228,10 +219,11 @@
 ;
 ; P7-LABEL: test7:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lhz r4, 0(r4)
-; P7-NEXT:    addi r5, r1, -16
-; P7-NEXT:    sth r4, -16(r1)
-; P7-NEXT:    lxvw4x v2, 0, r5
+; P7-NEXT:    li r5, 1
+; P7-NEXT:    lvx v2, 0, r4
+; P7-NEXT:    lvsl v4, 0, r4
+; P7-NEXT:    lvx v3, r5, r4
+; P7-NEXT:    vperm v2, v2, v3, v4
 ; P7-NEXT:    vsplth v2, v2, 0
 ; P7-NEXT:    stxvw4x v2, 0, r3
 ; P7-NEXT:    blr
@@ -254,7 +246,7 @@
 ;
 ; P8-LABEL: test8:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lbz r4, 0(r4)
+; P8-NEXT:    lbzx r4, 0, r4
 ; P8-NEXT:    mtvsrd v2, r4
 ; P8-NEXT:    vspltb v2, v2, 7
 ; P8-NEXT:    stvx v2, 0, r3
@@ -262,10 +254,9 @@
 ;
 ; P7-LABEL: test8:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lbz r4, 0(r4)
-; P7-NEXT:    addi r5, r1, -16
-; P7-NEXT:    stb r4, -16(r1)
-; P7-NEXT:    lxvw4x v2, 0, r5
+; P7-NEXT:    lvsl v2, 0, r4
+; P7-NEXT:    lvx v3, 0, r4
+; P7-NEXT:    vperm v2, v3, v3, v2
 ; P7-NEXT:    vspltb v2, v2, 0
 ; P7-NEXT:    stxvw4x v2, 0, r3
 ; P7-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll
@@ -204,25 +204,25 @@
 ; P9LE-LABEL: s2v_test6:
 ; P9LE:       # %bb.0: # %entry
 ; P9LE-NEXT:    lfiwax f0, 0, r3
-; P9LE-NEXT:    xxspltd v2, vs0, 0
+; P9LE-NEXT:    xxspltd v2, f0, 0
 ; P9LE-NEXT:    blr
 ;
 ; P9BE-LABEL: s2v_test6:
 ; P9BE:       # %bb.0: # %entry
 ; P9BE-NEXT:    lfiwax f0, 0, r3
-; P9BE-NEXT:    xxspltd v2, vs0, 0
+; P9BE-NEXT:    xxspltd v2, f0, 0
 ; P9BE-NEXT:    blr
 ;
 ; P8LE-LABEL: s2v_test6:
 ; P8LE:       # %bb.0: # %entry
 ; P8LE-NEXT:    lfiwax f0, 0, r3
-; P8LE-NEXT:    xxspltd v2, vs0, 0
+; P8LE-NEXT:    xxspltd v2, f0, 0
 ; P8LE-NEXT:    blr
 ;
 ; P8BE-LABEL: s2v_test6:
 ; P8BE:       # %bb.0: # %entry
 ; P8BE-NEXT:    lfiwax f0, 0, r3
-; P8BE-NEXT:    xxspltd v2, vs0, 0
+; P8BE-NEXT:    xxspltd v2, f0, 0
 ; P8BE-NEXT:    blr
@@ -240,25 +240,25 @@
 ; P9LE-LABEL: s2v_test7:
 ; P9LE:       # %bb.0: # %entry
 ; P9LE-NEXT:    lfiwax f0, 0, r3
-; P9LE-NEXT:    xxspltd v2, vs0, 0
+; P9LE-NEXT:    xxspltd v2, f0, 0
 ; P9LE-NEXT:    blr
 ;
 ; P9BE-LABEL: s2v_test7:
 ; P9BE:       # %bb.0: # %entry
 ; P9BE-NEXT:    lfiwax f0, 0, r3
-; P9BE-NEXT:    xxspltd v2, vs0, 0
+; P9BE-NEXT:    xxspltd v2, f0, 0
 ; P9BE-NEXT:    blr
 ;
 ; P8LE-LABEL: s2v_test7:
 ; P8LE:       # %bb.0: # %entry
 ; P8LE-NEXT:    lfiwax f0, 0, r3
-; P8LE-NEXT:    xxspltd v2, vs0, 0
+; P8LE-NEXT:    xxspltd v2, f0, 0
 ; P8LE-NEXT:    blr
 ;
 ; P8BE-LABEL: s2v_test7:
 ; P8BE:       # %bb.0: # %entry
 ; P8BE-NEXT:    lfiwax f0, 0, r3
-; P8BE-NEXT:    xxspltd v2, vs0, 0
+; P8BE-NEXT:    xxspltd v2, f0, 0
 ; P8BE-NEXT:    blr
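For reference, a minimal IR sketch of an input that now takes the new SEXT_LD_SPLAT path; it mirrors test5 in load-and-splat.ll above, and the function name is illustrative:

define <2 x i64> @splat_sext_load_v2i64(i32* %a) {
entry:
  ; An i32 load sign-extended to i64, then splatted into both lanes.
  %val = load i32, i32* %a, align 4
  %ext = sext i32 %val to i64
  %ins = insertelement <2 x i64> undef, i64 %ext, i32 0
  %splat = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}

isValidSplatLoad() classifies the extending i32 load, LowerBUILD_VECTOR emits PPCISD::SEXT_LD_SPLAT (ZEXT_LD_SPLAT for the zext form), and the new PPCInstrVSX.td patterns select it to lfiwax (lfiwzx) followed by a doubleword splat via XXPERMDIs, as the updated P7/P8/P9 CHECK lines show, eliminating the stack round trip.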