diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -845,6 +845,15 @@
     return false;
   }
 
+  /// Return true if the target wants a store of an element extracted from a
+  /// vector that is known to be a splat of a constant.
+  /// \p Index[out] gives the index of the vector element to be extracted when
+  /// this is true.
+  virtual bool shallExtractConstSplatVectorElementToStore(
+      Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
+    return false;
+  }
+
   /// Return true if inserting a scalar into a variable element of an undef
   /// vector is more efficiently handled by splatting the scalar instead.
   virtual bool shouldSplatInsEltVarIndex(EVT) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7757,13 +7757,28 @@
     }
 
     // If this store is smaller than the largest store see whether we can get
-    // the smaller value for free with a truncate.
+    // the smaller value for free with a truncate or extract vector element and
+    // then store.
     SDValue Value = MemSetValue;
     if (VT.bitsLT(LargestVT)) {
+      unsigned Index;
+      unsigned NElts = LargestVT.getSizeInBits() / VT.getSizeInBits();
+      EVT SVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), NElts);
       if (!LargestVT.isVector() && !VT.isVector() &&
           TLI.isTruncateFree(LargestVT, VT))
         Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
-      else
+      else if (LargestVT.isVector() && !VT.isVector() &&
+               TLI.shallExtractConstSplatVectorElementToStore(
+                   LargestVT.getTypeForEVT(*DAG.getContext()),
+                   VT.getSizeInBits(), Index) &&
+               TLI.isTypeLegal(SVT) &&
+               LargestVT.getSizeInBits() == SVT.getSizeInBits()) {
+        // A target that can combine store(extractelement VectorTy, Idx) gets
+        // the smaller value for free.
+        SDValue TailValue = DAG.getNode(ISD::BITCAST, dl, SVT, MemSetValue);
+        Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, TailValue,
+                            DAG.getVectorIdxConstant(Index, dl));
+      } else
         Value = getMemsetValue(Src, VT, DAG, dl);
     }
     assert(Value.getValueType() == VT && "Value with wrong type.");
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -791,6 +791,11 @@
       return true;
     }
 
+    bool
+    shallExtractConstSplatVectorElementToStore(Type *VectorTy,
+                                               unsigned ElemSizeInBits,
+                                               unsigned &Index) const override;
+
     bool isCtlzFast() const override {
       return true;
     }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1635,6 +1635,27 @@
   return VT.isScalarInteger();
 }
 
+bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
+    Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
+  if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
+    return false;
+
+  if (auto *VTy = dyn_cast<VectorType>(VectorTy)) {
+    if (VTy->getScalarType()->isIntegerTy()) {
+      // 8- and 16-bit values fit in an immediate field; not needed here.
+      if (ElemSizeInBits == 32) {
+        Index = Subtarget.isLittleEndian() ? 2 : 1;
+        return true;
+      }
+      if (ElemSizeInBits == 64) {
+        Index = Subtarget.isLittleEndian() ? 1 : 0;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER:    break;
@@ -17086,10 +17107,20 @@
   if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
     // We should use Altivec/VSX loads and stores when available. For unaligned
     // addresses, unaligned VSX loads are only fast starting with the P8.
-    if (Subtarget.hasAltivec() && Op.size() >= 16 &&
-        (Op.isAligned(Align(16)) ||
-         ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
-      return MVT::v4i32;
+    if (Subtarget.hasAltivec() && Op.size() >= 16) {
+      if (Op.isMemset() && Subtarget.hasVSX()) {
+        uint64_t TailSize = Op.size() % 16;
+        // For memset lowering, EXTRACT_VECTOR_ELT tries to return the constant
+        // element if the vector element type matches the tail store. A tail of
+        // 3/4 bytes is stored as i32, so use legal v8i16 instead of v4i32.
+        if (TailSize > 2 && TailSize <= 4) {
+          return MVT::v8i16;
+        }
+        return MVT::v4i32;
+      }
+      if (Op.isAligned(Align(16)) || Subtarget.hasP8Vector())
+        return MVT::v4i32;
+    }
   }
 
   if (Subtarget.isPPC64()) {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2031,8 +2031,15 @@
             (v8i16 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VSRC))>;
   def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 ForceXForm:$src)))),
             (v16i8 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VSRC))>;
+  def : Pat<(store (i64 (extractelt v2i64:$A, 1)), ForceXForm:$src),
+            (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
 }
 
+let Predicates = [IsISA3_1, IsBigEndian] in {
+  def : Pat<(store (i64 (extractelt v2i64:$A, 0)), ForceXForm:$src),
+            (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
+}
+
 // FIXME: The swap is overkill when the shift amount is a constant.
 // We should just fix the constant in the DAG.
let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { diff --git a/llvm/test/CodeGen/PowerPC/memset-tail.ll b/llvm/test/CodeGen/PowerPC/memset-tail.ll --- a/llvm/test/CodeGen/PowerPC/memset-tail.ll +++ b/llvm/test/CodeGen/PowerPC/memset-tail.ll @@ -169,59 +169,45 @@ ; P8-BE-LABEL: memsetTailV1B8: ; P8-BE: # %bb.0: # %entry ; P8-BE-NEXT: vspltisb 2, 15 -; P8-BE-NEXT: lis 4, 3855 -; P8-BE-NEXT: ori 4, 4, 3855 -; P8-BE-NEXT: rldimi 4, 4, 32, 0 +; P8-BE-NEXT: li 4, 16 +; P8-BE-NEXT: stxsdx 34, 3, 4 ; P8-BE-NEXT: stxvw4x 34, 0, 3 -; P8-BE-NEXT: std 4, 16(3) ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: memsetTailV1B8: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, 3855 ; P9-BE-NEXT: xxspltib 0, 15 -; P9-BE-NEXT: ori 4, 4, 3855 ; P9-BE-NEXT: stxv 0, 0(3) -; P9-BE-NEXT: rldimi 4, 4, 32, 0 -; P9-BE-NEXT: std 4, 16(3) +; P9-BE-NEXT: stfd 0, 16(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memsetTailV1B8: ; P10-BE: # %bb.0: # %entry -; P10-BE-NEXT: pli 4, 252645135 -; P10-BE-NEXT: rldimi 4, 4, 32, 0 -; P10-BE-NEXT: std 4, 16(3) ; P10-BE-NEXT: xxspltib 0, 15 ; P10-BE-NEXT: stxv 0, 0(3) +; P10-BE-NEXT: stfd 0, 16(3) ; P10-BE-NEXT: blr ; ; P8-LE-LABEL: memsetTailV1B8: ; P8-LE: # %bb.0: # %entry -; P8-LE-NEXT: lis 4, 3855 ; P8-LE-NEXT: vspltisb 2, 15 -; P8-LE-NEXT: ori 4, 4, 3855 -; P8-LE-NEXT: rldimi 4, 4, 32, 0 -; P8-LE-NEXT: std 4, 16(3) +; P8-LE-NEXT: li 4, 16 +; P8-LE-NEXT: stxsdx 34, 3, 4 ; P8-LE-NEXT: stxvd2x 34, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memsetTailV1B8: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, 3855 ; P9-LE-NEXT: xxspltib 0, 15 -; P9-LE-NEXT: ori 4, 4, 3855 ; P9-LE-NEXT: stxv 0, 0(3) -; P9-LE-NEXT: rldimi 4, 4, 32, 0 -; P9-LE-NEXT: std 4, 16(3) +; P9-LE-NEXT: stfd 0, 16(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memsetTailV1B8: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, 252645135 -; P10-LE-NEXT: rldimi 4, 4, 32, 0 -; P10-LE-NEXT: std 4, 16(3) ; P10-LE-NEXT: xxspltib 0, 15 ; P10-LE-NEXT: stxv 0, 0(3) +; P10-LE-NEXT: stfd 0, 16(3) ; P10-LE-NEXT: blr 
entry: tail call void @llvm.memset.p0.i64(ptr %p, i8 15, i64 24, i1 false) @@ -231,63 +217,45 @@ define dso_local void @memsetTailV1B7(ptr nocapture noundef writeonly %p) local_unnamed_addr { ; P8-BE-LABEL: memsetTailV1B7: ; P8-BE: # %bb.0: # %entry -; P8-BE-NEXT: lis 4, 3855 ; P8-BE-NEXT: vspltisb 2, 15 -; P8-BE-NEXT: li 5, 15 -; P8-BE-NEXT: ori 4, 4, 3855 -; P8-BE-NEXT: rldimi 4, 4, 32, 0 -; P8-BE-NEXT: stdx 4, 3, 5 +; P8-BE-NEXT: li 4, 15 +; P8-BE-NEXT: stxsdx 34, 3, 4 ; P8-BE-NEXT: stxvw4x 34, 0, 3 ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: memsetTailV1B7: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, 3855 -; P9-BE-NEXT: li 5, 15 -; P9-BE-NEXT: ori 4, 4, 3855 -; P9-BE-NEXT: rldimi 4, 4, 32, 0 -; P9-BE-NEXT: stdx 4, 3, 5 ; P9-BE-NEXT: xxspltib 0, 15 +; P9-BE-NEXT: stfd 0, 15(3) ; P9-BE-NEXT: stxv 0, 0(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memsetTailV1B7: ; P10-BE: # %bb.0: # %entry -; P10-BE-NEXT: pli 4, 252645135 -; P10-BE-NEXT: rldimi 4, 4, 32, 0 -; P10-BE-NEXT: pstd 4, 15(3), 0 ; P10-BE-NEXT: xxspltib 0, 15 +; P10-BE-NEXT: stfd 0, 15(3) ; P10-BE-NEXT: stxv 0, 0(3) ; P10-BE-NEXT: blr ; ; P8-LE-LABEL: memsetTailV1B7: ; P8-LE: # %bb.0: # %entry -; P8-LE-NEXT: lis 4, 3855 ; P8-LE-NEXT: vspltisb 2, 15 -; P8-LE-NEXT: li 5, 15 -; P8-LE-NEXT: ori 4, 4, 3855 -; P8-LE-NEXT: rldimi 4, 4, 32, 0 -; P8-LE-NEXT: stdx 4, 3, 5 +; P8-LE-NEXT: li 4, 15 +; P8-LE-NEXT: stxsdx 34, 3, 4 ; P8-LE-NEXT: stxvd2x 34, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memsetTailV1B7: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, 3855 -; P9-LE-NEXT: li 5, 15 -; P9-LE-NEXT: ori 4, 4, 3855 -; P9-LE-NEXT: rldimi 4, 4, 32, 0 -; P9-LE-NEXT: stdx 4, 3, 5 ; P9-LE-NEXT: xxspltib 0, 15 +; P9-LE-NEXT: stfd 0, 15(3) ; P9-LE-NEXT: stxv 0, 0(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memsetTailV1B7: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, 252645135 -; P10-LE-NEXT: rldimi 4, 4, 32, 0 -; P10-LE-NEXT: pstd 4, 15(3), 0 ; P10-LE-NEXT: xxspltib 0, 15 +; P10-LE-NEXT: stfd 0, 15(3) ; P10-LE-NEXT: stxv 0, 0(3) ; 
P10-LE-NEXT: blr entry: @@ -299,52 +267,48 @@ ; P8-BE-LABEL: memsetTailV1B4: ; P8-BE: # %bb.0: # %entry ; P8-BE-NEXT: vspltisb 2, 15 -; P8-BE-NEXT: lis 4, 3855 -; P8-BE-NEXT: ori 4, 4, 3855 -; P8-BE-NEXT: stw 4, 16(3) +; P8-BE-NEXT: li 4, 16 +; P8-BE-NEXT: stxsiwx 34, 3, 4 ; P8-BE-NEXT: stxvw4x 34, 0, 3 ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: memsetTailV1B4: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, 3855 -; P9-BE-NEXT: ori 4, 4, 3855 -; P9-BE-NEXT: stw 4, 16(3) ; P9-BE-NEXT: xxspltib 0, 15 +; P9-BE-NEXT: li 4, 16 +; P9-BE-NEXT: stfiwx 0, 3, 4 ; P9-BE-NEXT: stxv 0, 0(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memsetTailV1B4: ; P10-BE: # %bb.0: # %entry -; P10-BE-NEXT: pli 4, 252645135 -; P10-BE-NEXT: stw 4, 16(3) ; P10-BE-NEXT: xxspltib 0, 15 +; P10-BE-NEXT: li 4, 16 +; P10-BE-NEXT: stfiwx 0, 3, 4 ; P10-BE-NEXT: stxv 0, 0(3) ; P10-BE-NEXT: blr ; ; P8-LE-LABEL: memsetTailV1B4: ; P8-LE: # %bb.0: # %entry ; P8-LE-NEXT: vspltisb 2, 15 -; P8-LE-NEXT: lis 4, 3855 -; P8-LE-NEXT: ori 4, 4, 3855 -; P8-LE-NEXT: stw 4, 16(3) +; P8-LE-NEXT: li 4, 16 +; P8-LE-NEXT: stxsiwx 34, 3, 4 ; P8-LE-NEXT: stxvd2x 34, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memsetTailV1B4: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, 3855 -; P9-LE-NEXT: ori 4, 4, 3855 -; P9-LE-NEXT: stw 4, 16(3) ; P9-LE-NEXT: xxspltib 0, 15 +; P9-LE-NEXT: li 4, 16 +; P9-LE-NEXT: stfiwx 0, 3, 4 ; P9-LE-NEXT: stxv 0, 0(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memsetTailV1B4: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, 252645135 -; P10-LE-NEXT: stw 4, 16(3) ; P10-LE-NEXT: xxspltib 0, 15 +; P10-LE-NEXT: li 4, 16 +; P10-LE-NEXT: stfiwx 0, 3, 4 ; P10-LE-NEXT: stxv 0, 0(3) ; P10-LE-NEXT: blr entry: @@ -356,52 +320,48 @@ ; P8-BE-LABEL: memsetTailV1B3: ; P8-BE: # %bb.0: # %entry ; P8-BE-NEXT: vspltisb 2, 15 -; P8-BE-NEXT: lis 4, 3855 -; P8-BE-NEXT: ori 4, 4, 3855 +; P8-BE-NEXT: li 4, 15 +; P8-BE-NEXT: stxsiwx 34, 3, 4 ; P8-BE-NEXT: stxvw4x 34, 0, 3 -; P8-BE-NEXT: stw 4, 15(3) ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: 
memsetTailV1B3: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, 3855 -; P9-BE-NEXT: ori 4, 4, 3855 -; P9-BE-NEXT: stw 4, 15(3) ; P9-BE-NEXT: xxspltib 0, 15 +; P9-BE-NEXT: li 4, 15 +; P9-BE-NEXT: stfiwx 0, 3, 4 ; P9-BE-NEXT: stxv 0, 0(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memsetTailV1B3: ; P10-BE: # %bb.0: # %entry -; P10-BE-NEXT: pli 4, 252645135 -; P10-BE-NEXT: stw 4, 15(3) ; P10-BE-NEXT: xxspltib 0, 15 +; P10-BE-NEXT: li 4, 15 +; P10-BE-NEXT: stfiwx 0, 3, 4 ; P10-BE-NEXT: stxv 0, 0(3) ; P10-BE-NEXT: blr ; ; P8-LE-LABEL: memsetTailV1B3: ; P8-LE: # %bb.0: # %entry ; P8-LE-NEXT: vspltisb 2, 15 -; P8-LE-NEXT: lis 4, 3855 -; P8-LE-NEXT: ori 4, 4, 3855 -; P8-LE-NEXT: stw 4, 15(3) +; P8-LE-NEXT: li 4, 15 +; P8-LE-NEXT: stxsiwx 34, 3, 4 ; P8-LE-NEXT: stxvd2x 34, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memsetTailV1B3: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, 3855 -; P9-LE-NEXT: ori 4, 4, 3855 -; P9-LE-NEXT: stw 4, 15(3) ; P9-LE-NEXT: xxspltib 0, 15 +; P9-LE-NEXT: li 4, 15 +; P9-LE-NEXT: stfiwx 0, 3, 4 ; P9-LE-NEXT: stxv 0, 0(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memsetTailV1B3: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, 252645135 -; P10-LE-NEXT: stw 4, 15(3) ; P10-LE-NEXT: xxspltib 0, 15 +; P10-LE-NEXT: li 4, 15 +; P10-LE-NEXT: stfiwx 0, 3, 4 ; P10-LE-NEXT: stxv 0, 0(3) ; P10-LE-NEXT: blr entry: @@ -682,30 +642,22 @@ ; P8-BE: # %bb.0: # %entry ; P8-BE-NEXT: ld 4, L..C3(2) # %const.0 ; P8-BE-NEXT: lxvw4x 0, 0, 4 -; P8-BE-NEXT: lis 4, -23131 -; P8-BE-NEXT: ori 4, 4, 42405 -; P8-BE-NEXT: rldimi 4, 4, 32, 0 +; P8-BE-NEXT: stfd 0, 16(3) ; P8-BE-NEXT: stxvw4x 0, 0, 3 -; P8-BE-NEXT: std 4, 16(3) ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: memset2TailV1B8: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, -23131 ; P9-BE-NEXT: xxspltib 0, 165 -; P9-BE-NEXT: ori 4, 4, 42405 ; P9-BE-NEXT: stxv 0, 0(3) -; P9-BE-NEXT: rldimi 4, 4, 32, 0 -; P9-BE-NEXT: std 4, 16(3) +; P9-BE-NEXT: stfd 0, 16(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memset2TailV1B8: ; P10-BE: # %bb.0: # %entry -; 
P10-BE-NEXT: pli 4, 2779096485 -; P10-BE-NEXT: rldimi 4, 4, 32, 0 -; P10-BE-NEXT: std 4, 16(3) ; P10-BE-NEXT: xxspltib 0, 165 ; P10-BE-NEXT: stxv 0, 0(3) +; P10-BE-NEXT: stfd 0, 16(3) ; P10-BE-NEXT: blr ; ; P8-LE-LABEL: memset2TailV1B8: @@ -713,30 +665,22 @@ ; P8-LE-NEXT: addis 4, 2, .LCPI12_0@toc@ha ; P8-LE-NEXT: addi 4, 4, .LCPI12_0@toc@l ; P8-LE-NEXT: lxvd2x 0, 0, 4 -; P8-LE-NEXT: lis 4, -23131 -; P8-LE-NEXT: ori 4, 4, 42405 -; P8-LE-NEXT: rldimi 4, 4, 32, 0 -; P8-LE-NEXT: std 4, 16(3) +; P8-LE-NEXT: stfd 0, 16(3) ; P8-LE-NEXT: stxvd2x 0, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memset2TailV1B8: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, -23131 ; P9-LE-NEXT: xxspltib 0, 165 -; P9-LE-NEXT: ori 4, 4, 42405 ; P9-LE-NEXT: stxv 0, 0(3) -; P9-LE-NEXT: rldimi 4, 4, 32, 0 -; P9-LE-NEXT: std 4, 16(3) +; P9-LE-NEXT: stfd 0, 16(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memset2TailV1B8: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, 2779096485 -; P10-LE-NEXT: rldimi 4, 4, 32, 0 -; P10-LE-NEXT: std 4, 16(3) ; P10-LE-NEXT: xxspltib 0, 165 ; P10-LE-NEXT: stxv 0, 0(3) +; P10-LE-NEXT: stfd 0, 16(3) ; P10-LE-NEXT: blr entry: tail call void @llvm.memset.p0.i64(ptr %p, i8 165, i64 24, i1 false) @@ -747,65 +691,45 @@ ; P8-BE-LABEL: memset2TailV1B7: ; P8-BE: # %bb.0: # %entry ; P8-BE-NEXT: ld 4, L..C4(2) # %const.0 -; P8-BE-NEXT: lis 5, -23131 ; P8-BE-NEXT: lxvw4x 0, 0, 4 -; P8-BE-NEXT: ori 4, 5, 42405 -; P8-BE-NEXT: li 5, 15 -; P8-BE-NEXT: rldimi 4, 4, 32, 0 -; P8-BE-NEXT: stdx 4, 3, 5 +; P8-BE-NEXT: stfd 0, 15(3) ; P8-BE-NEXT: stxvw4x 0, 0, 3 ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: memset2TailV1B7: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, -23131 -; P9-BE-NEXT: li 5, 15 -; P9-BE-NEXT: ori 4, 4, 42405 -; P9-BE-NEXT: rldimi 4, 4, 32, 0 -; P9-BE-NEXT: stdx 4, 3, 5 ; P9-BE-NEXT: xxspltib 0, 165 +; P9-BE-NEXT: stfd 0, 15(3) ; P9-BE-NEXT: stxv 0, 0(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memset2TailV1B7: ; P10-BE: # %bb.0: # %entry -; P10-BE-NEXT: pli 4, 2779096485 -; P10-BE-NEXT: 
rldimi 4, 4, 32, 0 -; P10-BE-NEXT: pstd 4, 15(3), 0 ; P10-BE-NEXT: xxspltib 0, 165 +; P10-BE-NEXT: stfd 0, 15(3) ; P10-BE-NEXT: stxv 0, 0(3) ; P10-BE-NEXT: blr ; ; P8-LE-LABEL: memset2TailV1B7: ; P8-LE: # %bb.0: # %entry ; P8-LE-NEXT: addis 4, 2, .LCPI13_0@toc@ha -; P8-LE-NEXT: lis 5, -23131 ; P8-LE-NEXT: addi 4, 4, .LCPI13_0@toc@l ; P8-LE-NEXT: lxvd2x 0, 0, 4 -; P8-LE-NEXT: ori 4, 5, 42405 -; P8-LE-NEXT: li 5, 15 -; P8-LE-NEXT: rldimi 4, 4, 32, 0 -; P8-LE-NEXT: stdx 4, 3, 5 +; P8-LE-NEXT: stfd 0, 15(3) ; P8-LE-NEXT: stxvd2x 0, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memset2TailV1B7: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, -23131 -; P9-LE-NEXT: li 5, 15 -; P9-LE-NEXT: ori 4, 4, 42405 -; P9-LE-NEXT: rldimi 4, 4, 32, 0 -; P9-LE-NEXT: stdx 4, 3, 5 ; P9-LE-NEXT: xxspltib 0, 165 +; P9-LE-NEXT: stfd 0, 15(3) ; P9-LE-NEXT: stxv 0, 0(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memset2TailV1B7: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, 2779096485 -; P10-LE-NEXT: rldimi 4, 4, 32, 0 -; P10-LE-NEXT: pstd 4, 15(3), 0 ; P10-LE-NEXT: xxspltib 0, 165 +; P10-LE-NEXT: stfd 0, 15(3) ; P10-LE-NEXT: stxv 0, 0(3) ; P10-LE-NEXT: blr entry: @@ -818,26 +742,24 @@ ; P8-BE: # %bb.0: # %entry ; P8-BE-NEXT: ld 4, L..C5(2) # %const.0 ; P8-BE-NEXT: lxvw4x 0, 0, 4 -; P8-BE-NEXT: lis 4, -23131 -; P8-BE-NEXT: ori 4, 4, 42405 -; P8-BE-NEXT: stw 4, 16(3) +; P8-BE-NEXT: li 4, 16 +; P8-BE-NEXT: stfiwx 0, 3, 4 ; P8-BE-NEXT: stxvw4x 0, 0, 3 ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: memset2TailV1B4: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, -23131 -; P9-BE-NEXT: ori 4, 4, 42405 -; P9-BE-NEXT: stw 4, 16(3) ; P9-BE-NEXT: xxspltib 0, 165 +; P9-BE-NEXT: li 4, 16 +; P9-BE-NEXT: stfiwx 0, 3, 4 ; P9-BE-NEXT: stxv 0, 0(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memset2TailV1B4: ; P10-BE: # %bb.0: # %entry -; P10-BE-NEXT: pli 4, -1515870811 -; P10-BE-NEXT: stw 4, 16(3) ; P10-BE-NEXT: xxspltib 0, 165 +; P10-BE-NEXT: li 4, 16 +; P10-BE-NEXT: stfiwx 0, 3, 4 ; P10-BE-NEXT: stxv 0, 0(3) ; P10-BE-NEXT: blr ; @@ 
-846,26 +768,24 @@ ; P8-LE-NEXT: addis 4, 2, .LCPI14_0@toc@ha ; P8-LE-NEXT: addi 4, 4, .LCPI14_0@toc@l ; P8-LE-NEXT: lxvd2x 0, 0, 4 -; P8-LE-NEXT: lis 4, -23131 -; P8-LE-NEXT: ori 4, 4, 42405 -; P8-LE-NEXT: stw 4, 16(3) +; P8-LE-NEXT: li 4, 16 +; P8-LE-NEXT: stfiwx 0, 3, 4 ; P8-LE-NEXT: stxvd2x 0, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memset2TailV1B4: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, -23131 -; P9-LE-NEXT: ori 4, 4, 42405 -; P9-LE-NEXT: stw 4, 16(3) ; P9-LE-NEXT: xxspltib 0, 165 +; P9-LE-NEXT: li 4, 16 +; P9-LE-NEXT: stfiwx 0, 3, 4 ; P9-LE-NEXT: stxv 0, 0(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memset2TailV1B4: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, -1515870811 -; P10-LE-NEXT: stw 4, 16(3) ; P10-LE-NEXT: xxspltib 0, 165 +; P10-LE-NEXT: li 4, 16 +; P10-LE-NEXT: stfiwx 0, 3, 4 ; P10-LE-NEXT: stxv 0, 0(3) ; P10-LE-NEXT: blr entry: @@ -878,26 +798,24 @@ ; P8-BE: # %bb.0: # %entry ; P8-BE-NEXT: ld 4, L..C6(2) # %const.0 ; P8-BE-NEXT: lxvw4x 0, 0, 4 -; P8-BE-NEXT: lis 4, -23131 -; P8-BE-NEXT: ori 4, 4, 42405 -; P8-BE-NEXT: stw 4, 15(3) +; P8-BE-NEXT: li 4, 15 +; P8-BE-NEXT: stfiwx 0, 3, 4 ; P8-BE-NEXT: stxvw4x 0, 0, 3 ; P8-BE-NEXT: blr ; ; P9-BE-LABEL: memset2TailV1B3: ; P9-BE: # %bb.0: # %entry -; P9-BE-NEXT: lis 4, -23131 -; P9-BE-NEXT: ori 4, 4, 42405 -; P9-BE-NEXT: stw 4, 15(3) ; P9-BE-NEXT: xxspltib 0, 165 +; P9-BE-NEXT: li 4, 15 +; P9-BE-NEXT: stfiwx 0, 3, 4 ; P9-BE-NEXT: stxv 0, 0(3) ; P9-BE-NEXT: blr ; ; P10-BE-LABEL: memset2TailV1B3: ; P10-BE: # %bb.0: # %entry -; P10-BE-NEXT: pli 4, -1515870811 -; P10-BE-NEXT: stw 4, 15(3) ; P10-BE-NEXT: xxspltib 0, 165 +; P10-BE-NEXT: li 4, 15 +; P10-BE-NEXT: stfiwx 0, 3, 4 ; P10-BE-NEXT: stxv 0, 0(3) ; P10-BE-NEXT: blr ; @@ -906,26 +824,24 @@ ; P8-LE-NEXT: addis 4, 2, .LCPI15_0@toc@ha ; P8-LE-NEXT: addi 4, 4, .LCPI15_0@toc@l ; P8-LE-NEXT: lxvd2x 0, 0, 4 -; P8-LE-NEXT: lis 4, -23131 -; P8-LE-NEXT: ori 4, 4, 42405 -; P8-LE-NEXT: stw 4, 15(3) +; P8-LE-NEXT: li 4, 15 +; P8-LE-NEXT: stfiwx 0, 3, 4 ; 
P8-LE-NEXT: stxvd2x 0, 0, 3 ; P8-LE-NEXT: blr ; ; P9-LE-LABEL: memset2TailV1B3: ; P9-LE: # %bb.0: # %entry -; P9-LE-NEXT: lis 4, -23131 -; P9-LE-NEXT: ori 4, 4, 42405 -; P9-LE-NEXT: stw 4, 15(3) ; P9-LE-NEXT: xxspltib 0, 165 +; P9-LE-NEXT: li 4, 15 +; P9-LE-NEXT: stfiwx 0, 3, 4 ; P9-LE-NEXT: stxv 0, 0(3) ; P9-LE-NEXT: blr ; ; P10-LE-LABEL: memset2TailV1B3: ; P10-LE: # %bb.0: # %entry -; P10-LE-NEXT: pli 4, -1515870811 -; P10-LE-NEXT: stw 4, 15(3) ; P10-LE-NEXT: xxspltib 0, 165 +; P10-LE-NEXT: li 4, 15 +; P10-LE-NEXT: stfiwx 0, 3, 4 ; P10-LE-NEXT: stxv 0, 0(3) ; P10-LE-NEXT: blr entry: