diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10513,6 +10513,40 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + EVT VT = Op.getValueType(); + SDNode *Op0 = Op->getOperand(0).getNode(); + + auto FoldScalarToVector = [&]() { + // If we are on targets that have VSX and P8 Vector, don't waste time here. + // td files have pseudo instruction LIWZX for this case. + if (!Subtarget.hasVSX() || Subtarget.hasP8Vector()) + return SDValue(); + + if (Op0->getOpcode() != ISD::LOAD || !Subtarget.hasLFIWAX()) + return SDValue(); + + LoadSDNode *LD = cast<LoadSDNode>(Op0); + unsigned Opcode; + if (LD->getValueType(0) == MVT::f32 || LD->getValueType(0) == MVT::i32 || + (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && + LD->getExtensionType() == ISD::ZEXTLOAD)) + Opcode = PPCISD::LFIWZX; + else if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && + LD->getExtensionType() == ISD::SEXTLOAD) + Opcode = PPCISD::LFIWAX; + else + return SDValue(); + + return DAG.getMemIntrinsicNode(Opcode, dl, DAG.getVTList(VT, MVT::Other), + {LD->getChain(), LD->getBasePtr()}, MVT::i32, + LD->getMemOperand()); + }; + + // Before we do the scalar_to_vector through memory, try to see if we can use + // a better way for some feeders. + if (SDValue Ret = FoldScalarToVector()) + return Ret; + // Create a stack slot that is 16-byte aligned. 
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, Align(16), false); diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -57,10 +57,8 @@ ; ; P7-LABEL: test2: ; P7: # %bb.0: # %entry -; P7-NEXT: lwz r4, 12(r4) -; P7-NEXT: addi r5, r1, -16 -; P7-NEXT: stw r4, -16(r1) -; P7-NEXT: lxvw4x vs0, 0, r5 +; P7-NEXT: addi r4, r4, 12 +; P7-NEXT: lfiwzx f0, 0, r4 ; P7-NEXT: xxspltw vs0, vs0, 0 ; P7-NEXT: stxvw4x vs0, 0, r3 ; P7-NEXT: blr @@ -91,10 +89,8 @@ ; ; P7-LABEL: test3: ; P7: # %bb.0: # %entry -; P7-NEXT: lwz r4, 12(r4) -; P7-NEXT: addi r5, r1, -16 -; P7-NEXT: stw r4, -16(r1) -; P7-NEXT: lxvw4x vs0, 0, r5 +; P7-NEXT: addi r4, r4, 12 +; P7-NEXT: lfiwzx f0, 0, r4 ; P7-NEXT: xxspltw vs0, vs0, 0 ; P7-NEXT: stxvw4x vs0, 0, r3 ; P7-NEXT: blr @@ -137,6 +133,7 @@ ret void } +; FIXME: use lfiwax for this case at PWR7. define void @test5(<2 x i64>* %a, i32* %in) { ; P9-LABEL: test5: ; P9: # %bb.0: # %entry @@ -170,6 +167,7 @@ ret void } +; FIXME: use lfiwzx for this case at PWR7. define void @test6(<2 x i64>* %a, i32* %in) { ; P9-LABEL: test6: ; P9: # %bb.0: # %entry @@ -217,10 +215,7 @@ ; ; P7-LABEL: unadjusted_lxvwsx: ; P7: # %bb.0: # %entry -; P7-NEXT: lwz r3, 0(r3) -; P7-NEXT: addi r4, r1, -16 -; P7-NEXT: stw r3, -16(r1) -; P7-NEXT: lxvw4x vs0, 0, r4 +; P7-NEXT: lfiwzx f0, 0, r3 ; P7-NEXT: xxspltw v2, vs0, 0 ; P7-NEXT: blr entry: