diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -610,6 +610,9 @@
   setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
+  setOperationAction(ISD::VP_LOAD, MVT::i32, Promote);
+  setOperationAction(ISD::VP_STORE, MVT::i32, Promote);
+
   // Comparisons that require checking two conditions.
   if (Subtarget.hasSPE()) {
     setCondCodeAction(ISD::SETO, MVT::f32, Expand);
@@ -1326,6 +1329,8 @@
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::VP_LOAD);
+  setTargetDAGCombine(ISD::VP_STORE);
   setTargetDAGCombine(ISD::BR_CC);
   if (Subtarget.useCRBits())
     setTargetDAGCombine(ISD::BRCOND);
@@ -15215,6 +15220,63 @@
       }
     }
     break;
+  case ISD::VP_LOAD: {
+    if (!DCI.isAfterLegalizeDAG())
+      break;
+    auto *LD = cast<VPLoadSDNode>(N);
+    SDValue Length = LD->getVectorLength();
+    SDNode *LengthNode = Length.getNode();
+    // We don't want to shift again on subsequent combines.
+    // As a workaround, we check if the length node is already a constant
+    // shift of at least 56. The shift+mul might get combined, so we also
+    // check for a corresponding constant multiplication.
+    if (LengthNode->getOpcode() == ISD::SHL)
+      if (isa<ConstantSDNode>(Length.getOperand(1).getNode()))
+        if (Length.getConstantOperandVal(1) >= 56)
+          return SDValue();
+    if (LengthNode->getOpcode() == ISD::MUL)
+      if (isa<ConstantSDNode>(Length.getOperand(1).getNode()))
+        if (Length.getConstantOperandVal(1) >> 56)
+          return SDValue();
+    SDLoc DL(N);
+    unsigned EltBytes = N->getValueType(0).getScalarSizeInBits() / 8;
+    SDValue ExtLength = DAG.getZExtOrTrunc(Length, DL, MVT::i64);
+    SDValue ShiftedLength =
+        DAG.getNode(ISD::SHL, DL, MVT::i64, ExtLength,
+                    DAG.getConstant(56 + countTrailingZeros(EltBytes), DL,
+                                    getPointerTy(DAG.getDataLayout())));
+    SmallVector<SDValue> NewOps(N->op_begin(), N->op_end());
+    NewOps[4] = ShiftedLength;
+    return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+    break;
+  }
+  case ISD::VP_STORE: {
+    if (!DCI.isAfterLegalizeDAG())
+      break;
+    auto *ST = cast<VPStoreSDNode>(N);
+    SDValue Length = ST->getVectorLength();
+    SDNode *LengthNode = Length.getNode();
+    if (LengthNode->getOpcode() == ISD::SHL)
+      if (isa<ConstantSDNode>(Length.getOperand(1).getNode()))
+        if (Length.getConstantOperandVal(1) >= 56)
+          return SDValue();
+    if (LengthNode->getOpcode() == ISD::MUL)
+      if (isa<ConstantSDNode>(Length.getOperand(1).getNode()))
+        if (Length.getConstantOperandVal(1) >> 56)
+          return SDValue();
+    SDLoc DL(N);
+    unsigned EltBytes =
+        N->getOperand(1).getValueType().getScalarSizeInBits() / 8;
+    SDValue ExtLength = DAG.getZExtOrTrunc(Length, DL, MVT::i64);
+    SDValue ShiftedLength =
+        DAG.getNode(ISD::SHL, DL, MVT::i64, ExtLength,
+                    DAG.getConstant(56 + countTrailingZeros(EltBytes), DL,
+                                    getPointerTy(DAG.getDataLayout())));
+    SmallVector<SDValue> NewOps(N->op_begin(), N->op_end());
+    NewOps[5] = ShiftedLength;
+    return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     bool isLittleEndian = Subtarget.isLittleEndian();
     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
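A note on the combine above: lxvl/stxvl take the number of bytes to transfer in bits 0-7 (the most significant byte) of the 64-bit length operand, so the EVL element count is shifted left by 56 plus log2 of the element size. A minimal sketch of that arithmetic (the helper name is illustrative, not part of the patch):

    #include <cstdint>

    // For example, a v4i32 access with EVL = 3 yields 3 << 58; the top byte is
    // then 12 (3 elements * 4 bytes), matching the `sldi ..., 58` in the tests.
    uint64_t encodeVectorLength(uint64_t EVL, unsigned EltBytes) {
      unsigned Log2EltBytes = __builtin_ctz(EltBytes); // countTrailingZeros
      return EVL << (56 + Log2EltBytes);
    }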
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2434,6 +2434,32 @@
                          (v2i64 (XXSPLTW EQWSHAND, 2)), 0));
 }
 
+def SDTVPLoad: SDTypeProfile<1, 4, [
+  SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisSameNumEltsAs<0, 3>, SDTCisInt<3>, SDTCisInt<4>
+]>;
+def SDTVPStore: SDTypeProfile<0, 5, [
+  SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisSameNumEltsAs<0, 3>, SDTCisInt<3>, SDTCisInt<4>
+]>;
+def vp_load : SDNode<"ISD::VP_LOAD", SDTVPLoad,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def vp_store : SDNode<"ISD::VP_STORE", SDTVPStore,
+                      [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def load_vl : PatFrags<(ops node:$src1, node:$src2), [
+    (vp_load node:$src1, undef, undef, node:$src2),
+    (vp_load node:$src1, undef, immAllZerosV, node:$src2)
+  ], [{
+  return !cast<VPLoadSDNode>(N)->isExpandingLoad() &&
+         cast<VPLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+         cast<VPLoadSDNode>(N)->isUnindexed();
+}]>;
+def store_vl : PatFrags<(ops node:$src1, node:$src2, node:$src3), [
+    (vp_store node:$src1, node:$src2, undef, undef, node:$src3),
+    (vp_store node:$src1, node:$src2, undef, immAllZerosV, node:$src3)
+  ], [{
+  return !cast<VPStoreSDNode>(N)->isTruncatingStore() &&
+         cast<VPStoreSDNode>(N)->isUnindexed();
+}]>;
+
 //---------------------------- Anonymous Patterns ----------------------------//
 // Predicate combinations are kept in roughly chronological order in terms of
 // instruction availability in the architecture. For example, VSX came in with
@@ -3861,6 +3887,21 @@
 def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, XForm:$dst),
           (STXVX $rS, XForm:$dst)>;
 
+// ld/st-with-length patterns
+foreach vt = [ v2i64, v4i32, v2f64, v4f32, ] in {
+  def : Pat<(!cast<ValueType>(""#vt) (load_vl addr:$src, i64:$rB)),
+            (LXVL $src, $rB)>;
+  def : Pat<(store_vl !cast<ValueType>(""#vt):$rS, addr:$dst, i64:$rB),
+            (STXVL $rS, $dst, $rB)>;
+}
+// We have to do v8i16 and v16i8 separately because they are not in VSRC.
+foreach vt = [ v8i16, v16i8, ] in {
+  def : Pat<(!cast<ValueType>(""#vt) (load_vl addr:$src, i64:$rB)),
+            (COPY_TO_REGCLASS (LXVL $src, $rB), VRRC)>;
+  def : Pat<(store_vl !cast<ValueType>(""#vt):$rS, addr:$dst, i64:$rB),
+            (STXVL (COPY_TO_REGCLASS $rS, VSRC), $dst, $rB)>;
+}
+
 // Build vectors from i8 loads
 defm : ScalToVecWPermute
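The LXVL/STXVL patterns take that pre-shifted value directly as the $rB length operand. As a rough behavioral model of how the instruction interprets it (an assumption based on the ISA 3.0 description, not something this patch encodes):

    #include <algorithm>
    #include <cstdint>

    // The byte count lives in the top byte of the length GPR; at most the full
    // 16-byte register is transferred, and a count of zero transfers nothing.
    unsigned lxvlByteCount(uint64_t LenGPR) {
      return static_cast<unsigned>(std::min<uint64_t>(LenGPR >> 56, 16));
    }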
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ ... @@
                              &Args) const;
+  TTI::VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const;
   /// @}
 };
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1337,3 +1337,48 @@
   return false;
 }
+
+TTI::VPLegalization
+PPCTTIImpl::getVPLegalizationStrategy(const VPIntrinsic &PI) const {
+  auto Legal = TargetTransformInfo::VPLegalization(
+      /* EVLParamStrategy */ TargetTransformInfo::VPLegalization::Legal,
+      /* OperatorStrategy */ TargetTransformInfo::VPLegalization::Legal);
+  auto Illegal = BaseT::getVPLegalizationStrategy(PI);
+  // Masks are unsupported.
+  if (!isa<UndefValue>(PI.getMaskParam()))
+    return Illegal;
+  switch (PI.getIntrinsicID()) {
+  default:
+    return Illegal;
+  case Intrinsic::vp_load:
+  case Intrinsic::vp_store: {
+    // We currently don't support the target-independent interface for Altivec.
+    // The load/store-with-length instructions use bits 0-7 of the GPR operand
+    // and therefore cannot be used in 32-bit mode.
+    if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
+      return Illegal;
+    Type *DataType = PI.getIntrinsicID() == Intrinsic::vp_load
+                         ? PI.getType()
+                         : PI.getMemoryDataParam()->getType();
+    if (isa<ScalableVectorType>(DataType)) {
+      // auto *VecTy = dyn_cast<ScalableVectorType>(DataType);
+      unsigned VecWidth = DataType->getPrimitiveSizeInBits();
+      return VecWidth == 128 ? Legal : Illegal;
+    }
+    Type *ScalarTy = DataType->getScalarType();
+    if (ScalarTy->isPointerTy())
+      return Legal;
+
+    if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+      return Legal;
+
+    if (!ScalarTy->isIntegerTy())
+      return Illegal;
+
+    unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+    if (IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64)
+      return Legal;
+    return Illegal;
+  }
+  }
+}
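Restated as a plain predicate, the hook above keeps a vp.load/vp.store (both the EVL parameter and the operator report Legal) only under the conditions below. This sketch is a paraphrase for readability, not an interface used by the patch, and it assumes the mask check is for an undef mask as reconstructed above:

    // Power9/Power10 VSX in 64-bit mode, undef mask, and an element type the
    // lowering handles: pointers, float, double, or i8/i16/i32/i64.
    bool keepsVPLoadStore(bool HasP9OrP10Vector, bool IsPPC64, bool MaskIsUndef,
                          bool EltIsPtrFloatOrDouble, unsigned EltIntBits) {
      if (!HasP9OrP10Vector || !IsPPC64 || !MaskIsUndef)
        return false;
      if (EltIsPtrFloatOrDouble)
        return true;
      return EltIntBits == 8 || EltIntBits == 16 || EltIntBits == 32 ||
             EltIntBits == 64;
    }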
diff --git a/llvm/test/CodeGen/PowerPC/ldst-with-length-vector.ll b/llvm/test/CodeGen/PowerPC/ldst-with-length-vector.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ldst-with-length-vector.ll
@@ -0,0 +1,559 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+define void @store_vl_v2i64(<2 x i64>* %ptr, <2 x i64> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 7, 59
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v2i64(<2 x i64>, <2 x i64>*, <2 x i1>, i32)
+define <2 x i64> @load_vl_v2i64_i32(<2 x i64>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v2i64_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 4, 59
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+  %res = call <2 x i64> @llvm.vp.load.v2i64(<2 x i64>* %ptr, <2 x i1> undef, i32 %evl)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.vp.load.v2i64(<2 x i64>*, <2 x i1>, i32)
+
+define void @store_vl_v4i32(<4 x i32>* %ptr, <4 x i32> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 7, 58
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v4i32(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v4i32(<4 x i32>, <4 x i32>*, <4 x i1>, i32)
+define <4 x i32> @load_vl_v4i32_i32(<4 x i32>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v4i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 4, 58
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+  %res = call <4 x i32> @llvm.vp.load.v4i32(<4 x i32>* %ptr, <4 x i1> undef, i32 %evl)
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.vp.load.v4i32(<4 x i32>*, <4 x i1>, i32)
+
+define void @store_vl_v8i16(<8 x i16>* %ptr, <8 x i16> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 7, 57
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v8i16(<8 x i16> %val, <8 x i16>* %ptr, <8 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v8i16(<8 x i16>, <8 x i16>*, <8 x i1>, i32)
+define <8 x i16> @load_vl_v8i16_i32(<8 x i16>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v8i16_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 4, 57
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+  %res = call <8 x i16> @llvm.vp.load.v8i16(<8 x i16>* %ptr, <8 x i1> undef, i32 %evl)
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.vp.load.v8i16(<8 x i16>*, <8 x i1>, i32)
+
+define void @store_vl_v16i8(<16 x i8>* %ptr, <16 x i8> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 7, 56
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v16i8(<16 x i8> %val, <16 x i8>* %ptr, <16 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v16i8(<16 x i8>, <16 x i8>*, <16 x i1>, i32)
+define <16 x i8> @load_vl_v16i8_i32(<16 x i8>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v16i8_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sldi 4, 4, 56
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+  %res = call <16 x i8> @llvm.vp.load.v16i8(<16 x i8>* %ptr, <16 x i1> undef, i32 %evl)
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.vp.load.v16i8(<16 x i8>*, <16 x i1>, i32)
+
+define void @store_vl_v4i64(<4 x i64>* %ptr, <4 x i64> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: addi 4, 3, 16
+; CHECK-NEXT: rldic 5, 5, 60, 3
+; CHECK-NEXT: stxvl 35, 4, 5
+; CHECK-NEXT: stxvl 34, 3, 5
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v4i64(<4 x i64> %val, <4 x i64>* %ptr, <4 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v4i64(<4 x i64>, <4 x i64>*, <4 x i1>, i32)
+define <4 x i64> @load_vl_v4i64_i32(<4 x i64>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v4i64_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: rldic 4, 4, 60, 3
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: addi 3, 3, 16
+; CHECK-NEXT: lxvl 35, 3, 4
+; CHECK-NEXT: blr
+  %res = call <4 x i64> @llvm.vp.load.v4i64(<4 x i64>* %ptr, <4 x i1> undef, i32 %evl)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.vp.load.v4i64(<4 x i64>*, <4 x i1>, i32)
+
+define void @store_vl_v8i32(<8 x i32>* %ptr, <8 x i32> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: addi 4, 3, 16
+; CHECK-NEXT: rldic 5, 5, 60, 3
+; CHECK-NEXT: stxvl 35, 4, 5
+; CHECK-NEXT: stxvl 34, 3, 5
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v8i32(<8 x i32> %val, <8 x i32>* %ptr, <8 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v8i32(<8 x i32>, <8 x i32>*, <8 x i1>, i32)
+define <8 x i32> @load_vl_v8i32_i32(<8 x i32>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v8i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: rldic 4, 4, 60, 3
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: addi 3, 3, 16
+; CHECK-NEXT: lxvl 35, 3, 4
+; CHECK-NEXT: blr
+  %res = call <8 x i32> @llvm.vp.load.v8i32(<8 x i32>* %ptr, <8 x i1> undef, i32 %evl)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.vp.load.v8i32(<8 x i32>*, <8 x i1>, i32)
+
+define void @store_vl_v16i16(<16 x i16>* %ptr, <16 x i16> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: addi 4, 3, 16
+; CHECK-NEXT: rldic 5, 5, 60, 3
+; CHECK-NEXT: stxvl 35, 4, 5
+; CHECK-NEXT: stxvl 34, 3, 5
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v16i16(<16 x i16> %val, <16 x i16>* %ptr, <16 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v16i16(<16 x i16>, <16 x i16>*, <16 x i1>, i32)
+define <16 x i16> @load_vl_v16i16_i32(<16 x i16>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v16i16_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: rldic 4, 4, 60, 3
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: addi 3, 3, 16
+; CHECK-NEXT: lxvl 35, 3, 4
+; CHECK-NEXT: blr
+  %res = call <16 x i16> @llvm.vp.load.v16i16(<16 x i16>* %ptr, <16 x i1> undef, i32 %evl)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.vp.load.v16i16(<16 x i16>*, <16 x i1>, i32)
+
+define void @store_vl_v32i8(<32 x i8>* %ptr, <32 x i8> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: addi 4, 3, 16
+; CHECK-NEXT: rldic 5, 5, 60, 3
+; CHECK-NEXT: stxvl 35, 4, 5
+; CHECK-NEXT: stxvl 34, 3, 5
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v32i8(<32 x i8> %val, <32 x i8>* %ptr, <32 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v32i8(<32 x i8>, <32 x i8>*, <32 x i1>, i32)
+define <32 x i8> @load_vl_v32i8_i32(<32 x i8>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v32i8_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: rldic 4, 4, 60, 3
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: addi 3, 3, 16
+; CHECK-NEXT: lxvl 35, 3, 4
+; CHECK-NEXT: blr
+  %res = call <32 x i8> @llvm.vp.load.v32i8(<32 x i8>* %ptr, <32 x i1> undef, i32 %evl)
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.vp.load.v32i8(<32 x i8>*, <32 x i1>, i32)
+
+define void @store_vl_v3i64(<3 x i64>* %ptr, <3 x i64> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v3i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mtvsrdd 35, 5, 4
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: mtfprd 0, 6
+; CHECK-NEXT: xxswapd 34, 0
+; CHECK-NEXT: rldic 5, 4, 60, 3
+; CHECK-NEXT: rldic 4, 4, 59, 4
+; CHECK-NEXT: stxvl 35, 3, 5
+; CHECK-NEXT: addi 3, 3, 16
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v3i64(<3 x i64> %val, <3 x i64>* %ptr, <3 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v3i64(<3 x i64>, <3 x i64>*, <3 x i1>, i32)
+define <3 x i64> @load_vl_v3i64_i32(<3 x i64>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v3i64_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: addi 4, 3, 16
+; CHECK-NEXT: rldic 6, 5, 59, 4
+; CHECK-NEXT: lxvl 34, 4, 6
+; CHECK-NEXT: rldic 4, 5, 60, 3
+; CHECK-NEXT: lxvl 0, 3, 4
+; CHECK-NEXT: mfvsrld 5, 34
+; CHECK-NEXT: mfvsrld 3, 0
+; CHECK-NEXT: mffprd 4, 0
+; CHECK-NEXT: blr
+  %res = call <3 x i64> @llvm.vp.load.v3i64(<3 x i64>* %ptr, <3 x i1> undef, i32 %evl)
+  ret <3 x i64> %res
+}
+declare <3 x i64> @llvm.vp.load.v3i64(<3 x i64>*, <3 x i1>, i32)
+
+define void @store_vl_v7i32(<7 x i32>* %ptr, <7 x i32> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v7i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rldimi 4, 5, 32, 0
+; CHECK-NEXT: rldimi 6, 7, 32, 0
+; CHECK-NEXT: mtvsrwz 34, 8
+; CHECK-NEXT: mtvsrwz 35, 9
+; CHECK-NEXT: mtvsrwz 36, 10
+; CHECK-NEXT: mtvsrdd 0, 6, 4
+; CHECK-NEXT: addis 4, 2, .LCPI18_0@toc@ha
+; CHECK-NEXT: vmrghw 2, 3, 2
+; CHECK-NEXT: addi 4, 4, .LCPI18_0@toc@l
+; CHECK-NEXT: lxv 35, 0(4)
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: rldic 4, 4, 60, 3
+; CHECK-NEXT: stxvl 0, 3, 4
+; CHECK-NEXT: li 4, 3
+; CHECK-NEXT: addi 3, 3, 16
+; CHECK-NEXT: vperm 2, 4, 2, 3
+; CHECK-NEXT: rldic 4, 4, 58, 4
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v7i32(<7 x i32> %val, <7 x i32>* %ptr, <7 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v7i32(<7 x i32>, <7 x i32>*, <7 x i1>, i32)
+define <7 x i32> @load_vl_v7i32_i32(<7 x i32>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v7i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: rldic 5, 5, 60, 3
+; CHECK-NEXT: lxvl 0, 4, 5
+; CHECK-NEXT: li 5, 3
+; CHECK-NEXT: addi 4, 4, 16
+; CHECK-NEXT: rldic 5, 5, 58, 4
+; CHECK-NEXT: lxvl 1, 4, 5
+; CHECK-NEXT: li 4, 24
+; CHECK-NEXT: stfiwx 1, 3, 4
+; CHECK-NEXT: stxv 0, 0(3)
+; CHECK-NEXT: xxswapd 0, 1
+; CHECK-NEXT: stfd 0, 16(3)
+; CHECK-NEXT: blr
+  %res = call <7 x i32> @llvm.vp.load.v7i32(<7 x i32>* %ptr, <7 x i1> undef, i32 %evl)
+  ret <7 x i32> %res
+}
+declare <7 x i32> @llvm.vp.load.v7i32(<7 x i32>*, <7 x i1>, i32)
+
+define void @store_vl_v15i16(<15 x i16>* %ptr, <15 x i16> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v15i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mtvsrd 34, 4
+; CHECK-NEXT: mtvsrd 35, 5
+; CHECK-NEXT: mtvsrd 36, 7
+; CHECK-NEXT: addi 4, 1, 96
+; CHECK-NEXT: vmrghh 2, 3, 2
+; CHECK-NEXT: mtvsrd 35, 6
+; CHECK-NEXT: mtvsrd 37, 9
+; CHECK-NEXT: addi 5, 1, 104
+; CHECK-NEXT: vmrghh 3, 4, 3
+; CHECK-NEXT: mtvsrd 36, 8
+; CHECK-NEXT: vmrghh 4, 5, 4
+; CHECK-NEXT: mtvsrd 37, 10
+; CHECK-NEXT: vmrglw 2, 3, 2
+; CHECK-NEXT: lxsihzx 35, 0, 4
+; CHECK-NEXT: addi 4, 1, 112
+; CHECK-NEXT: vmrghh 3, 3, 5
+; CHECK-NEXT: vmrglw 3, 3, 4
+; CHECK-NEXT: xxmrgld 0, 35, 34
+; CHECK-NEXT: lxsihzx 34, 0, 5
+; CHECK-NEXT: lxsihzx 35, 0, 4
+; CHECK-NEXT: addi 4, 1, 120
+; CHECK-NEXT: addi 5, 1, 128
+; CHECK-NEXT: lxsihzx 36, 0, 5
+; CHECK-NEXT: addi 5, 1, 144
+; CHECK-NEXT: vmrghh 2, 3, 2
+; CHECK-NEXT: lxsihzx 35, 0, 4
+; CHECK-NEXT: addi 4, 1, 136
+; CHECK-NEXT: vmrghh 3, 4, 3
+; CHECK-NEXT: lxsihzx 36, 0, 5
+; CHECK-NEXT: addi 5, 1, 152
+; CHECK-NEXT: lxsihzx 37, 0, 5
+; CHECK-NEXT: li 5, 7
+; CHECK-NEXT: vmrglw 2, 3, 2
+; CHECK-NEXT: lxsihzx 35, 0, 4
+; CHECK-NEXT: addis 4, 2, .LCPI20_0@toc@ha
+; CHECK-NEXT: rldic 5, 5, 57, 4
+; CHECK-NEXT: addi 4, 4, .LCPI20_0@toc@l
+; CHECK-NEXT: vmrghh 3, 4, 3
+; CHECK-NEXT: lxv 36, 0(4)
+; CHECK-NEXT: addis 4, 2, .LCPI20_1@toc@ha
+; CHECK-NEXT: addi 4, 4, .LCPI20_1@toc@l
+; CHECK-NEXT: vperm 3, 5, 3, 4
+; CHECK-NEXT: lxv 36, 0(4)
+; CHECK-NEXT: addi 4, 3, 16
+; CHECK-NEXT: vperm 2, 3, 2, 4
+; CHECK-NEXT: stxvl 34, 4, 5
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: rldic 4, 4, 60, 3
+; CHECK-NEXT: stxvl 0, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v15i16(<15 x i16> %val, <15 x i16>* %ptr, <15 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v15i16(<15 x i16>, <15 x i16>*, <15 x i1>, i32)
+define <15 x i16> @load_vl_v15i16_i32(<15 x i16>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v15i16_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: rldic 5, 5, 60, 3
+; CHECK-NEXT: lxvl 0, 4, 5
+; CHECK-NEXT: li 5, 7
+; CHECK-NEXT: addi 4, 4, 16
+; CHECK-NEXT: rldic 5, 5, 57, 4
+; CHECK-NEXT: lxvl 34, 4, 5
+; CHECK-NEXT: li 4, 24
+; CHECK-NEXT: vsldoi 3, 2, 2, 12
+; CHECK-NEXT: stxsiwx 34, 3, 4
+; CHECK-NEXT: li 4, 28
+; CHECK-NEXT: stxsihx 35, 3, 4
+; CHECK-NEXT: stxv 0, 0(3)
+; CHECK-NEXT: xxswapd 0, 34
+; CHECK-NEXT: stfd 0, 16(3)
+; CHECK-NEXT: blr
+  %res = call <15 x i16> @llvm.vp.load.v15i16(<15 x i16>* %ptr, <15 x i1> undef, i32 %evl)
+  ret <15 x i16> %res
+}
+declare <15 x i16> @llvm.vp.load.v15i16(<15 x i16>*, <15 x i1>, i32)
+
+define void @store_vl_v31i8(<31 x i8>* %ptr, <31 x i8> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v31i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mtvsrd 34, 4
+; CHECK-NEXT: mtvsrd 35, 5
+; CHECK-NEXT: mtvsrd 36, 6
+; CHECK-NEXT: mtvsrd 37, 7
+; CHECK-NEXT: vmrghb 2, 3, 2
+; CHECK-NEXT: addi 11, 1, 104
+; CHECK-NEXT: addi 4, 1, 112
+; CHECK-NEXT: mtvsrd 35, 8
+; CHECK-NEXT: vmrghb 4, 5, 4
+; CHECK-NEXT: mtvsrd 37, 9
+; CHECK-NEXT: lxsibzx 32, 0, 4
+; CHECK-NEXT: addi 5, 1, 120
+; CHECK-NEXT: addi 6, 1, 128
+; CHECK-NEXT: lxsibzx 33, 0, 6
+; CHECK-NEXT: addi 7, 1, 136
+; CHECK-NEXT: addi 8, 1, 144
+; CHECK-NEXT: lxsibzx 38, 0, 8
+; CHECK-NEXT: vmrghb 3, 5, 3
+; CHECK-NEXT: lxsibzx 37, 0, 11
+; CHECK-NEXT: addi 9, 1, 152
+; CHECK-NEXT: addi 4, 1, 160
+; CHECK-NEXT: lxsibzx 39, 0, 4
+; CHECK-NEXT: addi 6, 1, 168
+; CHECK-NEXT: addi 4, 1, 176
+; CHECK-NEXT: lxsibzx 40, 0, 4
+; CHECK-NEXT: vmrglh 2, 4, 2
+; CHECK-NEXT: mtvsrd 36, 10
+; CHECK-NEXT: addi 4, 1, 200
+; CHECK-NEXT: vmrghb 5, 0, 5
+; CHECK-NEXT: lxsibzx 32, 0, 5
+; CHECK-NEXT: addi 5, 1, 96
+; CHECK-NEXT: vmrghb 0, 1, 0
+; CHECK-NEXT: lxsibzx 33, 0, 7
+; CHECK-NEXT: addi 7, 1, 184
+; CHECK-NEXT: vmrglh 5, 0, 5
+; CHECK-NEXT: vmrghb 1, 6, 1
+; CHECK-NEXT: lxsibzx 38, 0, 9
+; CHECK-NEXT: vmrghb 6, 7, 6
+; CHECK-NEXT: lxsibzx 39, 0, 5
+; CHECK-NEXT: addi 5, 1, 192
+; CHECK-NEXT: lxsibzx 41, 0, 5
+; CHECK-NEXT: addi 5, 1, 208
+; CHECK-NEXT: vmrglh 0, 6, 1
+; CHECK-NEXT: lxsibzx 42, 0, 5
+; CHECK-NEXT: addi 5, 1, 224
+; CHECK-NEXT: vmrghb 4, 7, 4
+; CHECK-NEXT: lxsibzx 39, 0, 6
+; CHECK-NEXT: vmrglw 5, 0, 5
+; CHECK-NEXT: vmrglh 3, 4, 3
+; CHECK-NEXT: vmrghb 7, 8, 7
+; CHECK-NEXT: lxsibzx 40, 0, 7
+; CHECK-NEXT: vmrglw 2, 3, 2
+; CHECK-NEXT: lxsibzx 35, 0, 5
+; CHECK-NEXT: addi 5, 1, 240
+; CHECK-NEXT: xxmrgld 0, 37, 34
+; CHECK-NEXT: vmrghb 8, 9, 8
+; CHECK-NEXT: lxsibzx 41, 0, 4
+; CHECK-NEXT: addi 4, 1, 216
+; CHECK-NEXT: lxsibzx 34, 0, 4
+; CHECK-NEXT: addi 4, 1, 232
+; CHECK-NEXT: vmrglh 4, 8, 7
+; CHECK-NEXT: vmrghb 9, 10, 9
+; CHECK-NEXT: vmrghb 2, 3, 2
+; CHECK-NEXT: lxsibzx 35, 0, 4
+; CHECK-NEXT: addi 4, 1, 248
+; CHECK-NEXT: vmrglh 2, 2, 9
+; CHECK-NEXT: vmrglw 2, 2, 4
+; CHECK-NEXT: lxsibzx 36, 0, 5
+; CHECK-NEXT: addi 5, 1, 256
+; CHECK-NEXT: lxsibzx 37, 0, 5
+; CHECK-NEXT: addi 5, 1, 272
+; CHECK-NEXT: vmrghb 3, 4, 3
+; CHECK-NEXT: lxsibzx 36, 0, 4
+; CHECK-NEXT: addi 4, 1, 264
+; CHECK-NEXT: vmrghb 4, 5, 4
+; CHECK-NEXT: lxsibzx 37, 0, 5
+; CHECK-NEXT: addi 5, 1, 280
+; CHECK-NEXT: lxsibzx 32, 0, 5
+; CHECK-NEXT: li 5, 15
+; CHECK-NEXT: vmrglh 3, 4, 3
+; CHECK-NEXT: lxsibzx 36, 0, 4
+; CHECK-NEXT: addis 4, 2, .LCPI22_0@toc@ha
+; CHECK-NEXT: rldic 5, 5, 56, 4
+; CHECK-NEXT: addi 4, 4, .LCPI22_0@toc@l
+; CHECK-NEXT: vmrghb 4, 5, 4
+; CHECK-NEXT: lxv 37, 0(4)
+; CHECK-NEXT: addis 4, 2, .LCPI22_1@toc@ha
+; CHECK-NEXT: addi 4, 4, .LCPI22_1@toc@l
+; CHECK-NEXT: vperm 4, 0, 4, 5
+; CHECK-NEXT: vmrglw 3, 4, 3
+; CHECK-NEXT: lxv 36, 0(4)
+; CHECK-NEXT: addi 4, 3, 16
+; CHECK-NEXT: vperm 2, 3, 2, 4
+; CHECK-NEXT: stxvl 34, 4, 5
+; CHECK-NEXT: li 4, 1
+; CHECK-NEXT: rldic 4, 4, 60, 3
+; CHECK-NEXT: stxvl 0, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v31i8(<31 x i8> %val, <31 x i8>* %ptr, <31 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v31i8(<31 x i8>, <31 x i8>*, <31 x i1>, i32)
+define <31 x i8> @load_vl_v31i8_i32(<31 x i8>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v31i8_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, 1
+; CHECK-NEXT: rldic 5, 5, 60, 3
+; CHECK-NEXT: lxvl 0, 4, 5
+; CHECK-NEXT: li 5, 15
+; CHECK-NEXT: addi 4, 4, 16
+; CHECK-NEXT: rldic 5, 5, 56, 4
+; CHECK-NEXT: lxvl 34, 4, 5
+; CHECK-NEXT: li 4, 24
+; CHECK-NEXT: vsldoi 3, 2, 2, 10
+; CHECK-NEXT: stxsiwx 34, 3, 4
+; CHECK-NEXT: li 4, 30
+; CHECK-NEXT: stxsibx 35, 3, 4
+; CHECK-NEXT: vsldoi 3, 2, 2, 12
+; CHECK-NEXT: li 4, 28
+; CHECK-NEXT: stxsihx 35, 3, 4
+; CHECK-NEXT: stxv 0, 0(3)
+; CHECK-NEXT: xxswapd 0, 34
+; CHECK-NEXT: stfd 0, 16(3)
+; CHECK-NEXT: blr
+  %res = call <31 x i8> @llvm.vp.load.v31i8(<31 x i8>* %ptr, <31 x i1> undef, i32 %evl)
+  ret <31 x i8> %res
+}
+declare <31 x i8> @llvm.vp.load.v31i8(<31 x i8>*, <31 x i1>, i32)
+
+define void @store_vl_v3i32(<3 x i32>* %ptr, <3 x i32> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v3i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 3
+; CHECK-NEXT: rldic 4, 4, 58, 4
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v3i32(<3 x i32> %val, <3 x i32>* %ptr, <3 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v3i32(<3 x i32>, <3 x i32>*, <3 x i1>, i32)
+define <3 x i32> @load_vl_v3i32_i32(<3 x i32>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v3i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 3
+; CHECK-NEXT: rldic 4, 4, 58, 4
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+  %res = call <3 x i32> @llvm.vp.load.v3i32(<3 x i32>* %ptr, <3 x i1> undef, i32 %evl)
+  ret <3 x i32> %res
+}
+declare <3 x i32> @llvm.vp.load.v3i32(<3 x i32>*, <3 x i1>, i32)
+
+define void @store_vl_v7i16(<7 x i16>* %ptr, <7 x i16> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v7i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 7
+; CHECK-NEXT: rldic 4, 4, 57, 4
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v7i16(<7 x i16> %val, <7 x i16>* %ptr, <7 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v7i16(<7 x i16>, <7 x i16>*, <7 x i1>, i32)
+define <7 x i16> @load_vl_v7i16_i32(<7 x i16>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v7i16_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 7
+; CHECK-NEXT: rldic 4, 4, 57, 4
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+  %res = call <7 x i16> @llvm.vp.load.v7i16(<7 x i16>* %ptr, <7 x i1> undef, i32 %evl)
+  ret <7 x i16> %res
+}
+declare <7 x i16> @llvm.vp.load.v7i16(<7 x i16>*, <7 x i1>, i32)
+
+define void @store_vl_v15i8(<15 x i8>* %ptr, <15 x i8> %val, i32 %evl) {
+; CHECK-LABEL: store_vl_v15i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 15
+; CHECK-NEXT: rldic 4, 4, 56, 4
+; CHECK-NEXT: stxvl 34, 3, 4
+; CHECK-NEXT: blr
+  call void @llvm.vp.store.v15i8(<15 x i8> %val, <15 x i8>* %ptr, <15 x i1> undef, i32 %evl)
+  ret void
+}
+declare void @llvm.vp.store.v15i8(<15 x i8>, <15 x i8>*, <15 x i1>, i32)
+define <15 x i8> @load_vl_v15i8_i32(<15 x i8>* %ptr, i32 %evl) {
+; CHECK-LABEL: load_vl_v15i8_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 4, 15
+; CHECK-NEXT: rldic 4, 4, 56, 4
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+  %res = call <15 x i8> @llvm.vp.load.v15i8(<15 x i8>* %ptr, <15 x i1> undef, i32 %evl)
+  ret <15 x i8> %res
+}
+declare <15 x i8> @llvm.vp.load.v15i8(<15 x i8>*, <15 x i1>, i32)
+
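Reading the checks above: when the length is a known constant, the li/rldic pairs simply materialize n << 56 so that the byte count sits in the top byte of the GPR. A small compile-time check of the two constants that appear most often (illustrative only, not part of the patch):

    #include <cstdint>

    constexpr unsigned topByte(uint64_t LenGPR) {
      return static_cast<unsigned>(LenGPR >> 56);
    }
    // li 5, 1 ; rldic 5, 5, 60, 3 builds 1 << 60 -> 16 bytes (a full register).
    static_assert(topByte(1ULL << 60) == 16, "full 16-byte lxvl/stxvl");
    // li 4, 3 ; rldic 4, 4, 58, 4 builds 3 << 58 -> 12 bytes (three i32s).
    static_assert(topByte(3ULL << 58) == 12, "3 x i32 = 12 bytes");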