diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -738,6 +738,8 @@
 MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true)
 MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true)
 MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(lxvp, "W256SLLiW256C*", false)
+MMA_BUILTIN(stxvp, "vW256SLLiW256C*", false)

 // FIXME: Obviously incomplete.

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14776,6 +14776,19 @@
       break;
 #include "clang/Basic/BuiltinsPPC.def"
     }
+    if (BuiltinID == PPC::BI__builtin_mma_lxvp ||
+        BuiltinID == PPC::BI__builtin_mma_stxvp) {
+      if (BuiltinID == PPC::BI__builtin_mma_lxvp) {
+        Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
+        Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]);
+      } else {
+        Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
+        Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]);
+      }
+      Ops.pop_back();
+      llvm::Function *F = CGM.getIntrinsic(ID);
+      return Builder.CreateCall(F, Ops, "");
+    }
     SmallVector<Value *, 4> CallOps;
     if (Accumulate) {
       Address Addr = EmitPointerWithAlignment(E->getArg(0));
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1422,6 +1422,14 @@
   def int_ppc_mma_xxsetaccz :
       Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>;

+  def int_ppc_mma_lxvp :
+      Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty],
+                [IntrReadMem, IntrArgMemOnly]>;
+
+  def int_ppc_mma_stxvp :
+      Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty],
+                [IntrWriteMem, IntrArgMemOnly]>;
+
   // MMA Reduced-Precision: Outer Product Intrinsic Definitions.
   defm int_ppc_mma_xvi4ger8 :
       PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -293,6 +293,13 @@
                                             Align(16));
   }

+  /// SelectAddrImmX34 - Returns true if the address N can be represented by
+  /// a base register plus a signed 34-bit displacement. Suitable for use by
+  /// PSTXVP and friends.
+  bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) {
+    return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG);
+  }
+
   // Select an address into a single register.
   bool SelectAddr(SDValue N, SDValue &Base) {
     Base = N;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -770,6 +770,8 @@
     bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
                              SelectionDAG &DAG,
                              MaybeAlign EncodingAlignment) const;
+    bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base,
+                               SelectionDAG &DAG) const;

     /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
     /// represented as an indexed [r+r] operation.
@@ -1325,6 +1327,8 @@

   bool isIntS16Immediate(SDNode *N, int16_t &Imm);
   bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+  bool isIntS34Immediate(SDNode *N, int64_t &Imm);
+  bool isIntS34Immediate(SDValue Op, int64_t &Imm);

   bool convertToNonDenormSingle(APInt &ArgAPInt);
   bool convertToNonDenormSingle(APFloat &ArgAPFloat);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2399,6 +2399,20 @@
   return false;
 }

+/// isIntS34Immediate - This method tests if the value of the given node can
+/// be accurately represented as a sign extension from a 34-bit value. If so,
+/// this returns true and the immediate.
+bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+  return isInt<34>(Imm);
+}
+bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
+  return isIntS34Immediate(Op.getNode(), Imm);
+}
+
 /// SelectAddressRegReg - Given the specified addressed, check to see if it
 /// can be represented as an indexed [r+r] operation. Returns false if it
 /// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
@@ -2599,6 +2613,50 @@
   return true; // [r+0]
 }

+/// Similar to the 16-bit case but for instructions that take a 34-bit
+/// displacement field (prefixed loads/stores).
+bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
+                                              SDValue &Base,
+                                              SelectionDAG &DAG) const {
+  // Only on 64-bit targets.
+  if (N.getValueType() != MVT::i64)
+    return false;
+
+  SDLoc dl(N);
+  int64_t Imm = 0;
+  if (N.getOpcode() == ISD::ADD) {
+    if (isIntS34Immediate(N.getOperand(1), Imm)) {
+      Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+      else
+        Base = N.getOperand(0);
+      return true;
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    if (isIntS34Immediate(N.getOperand(1), Imm)) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
+
+      if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) == ~0ULL) {
+        if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+        else
+          Base = N.getOperand(0);
+        Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+        return true;
+      }
+    }
+  } else if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.
+    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+    Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
+    return true;
+  }
+  return false;
+}
+
 /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
 /// represented as an indexed [r+r] operation.
 bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1031,11 +1031,13 @@
 // Define PowerPC specific addressing mode.

 // d-form
-def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb"
+def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb"
 // ds-form
-def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
 // dq-form
-def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
+def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
+// 8LS:d-form
+def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34", [], []>; // "pstxvp"

 // Below forms are all x-form addressing mode, use three different ones so we
 // can make a accurate check for x-form instructions in ISEL.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1654,6 +1654,24 @@
                          "pstxvp $XTp, $D_RA", IIC_LdStLFD>;
 }

+let Predicates = [PairedVectorMemops] in {
+  // Intrinsics for Paired Vector Loads.
+  def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>;
+  def : Pat<(v256i1 (int_ppc_mma_lxvp xaddr:$src)), (LXVPX xaddr:$src)>;
+  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+    def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>;
+  }
+  // Intrinsics for Paired Vector Stores.
+  def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst),
+            (STXVP $XSp, memrix16:$dst)>;
+  def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddr:$dst),
+            (STXVPX $XSp, xaddr:$dst)>;
+  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+    def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst),
+              (PSTXVP $XSp, memri34:$dst)>;
+  }
+}
+
 // TODO: We have an added complexity of 500 here. This is only a temporary
 // solution to have tablegen consider these patterns first. The way we do
 // addressing for PowerPC is complex depending on available D form, X form, or
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1223,7 +1223,8 @@
   case Intrinsic::ppc_vsx_lxvd2x_be:
   case Intrinsic::ppc_vsx_lxvw4x_be:
   case Intrinsic::ppc_vsx_lxvl:
-  case Intrinsic::ppc_vsx_lxvll: {
+  case Intrinsic::ppc_vsx_lxvll:
+  case Intrinsic::ppc_mma_lxvp: {
     Info.PtrVal = Inst->getArgOperand(0);
     Info.ReadMem = true;
     Info.WriteMem = false;
@@ -1239,7 +1240,8 @@
   case Intrinsic::ppc_vsx_stxvd2x_be:
   case Intrinsic::ppc_vsx_stxvw4x_be:
   case Intrinsic::ppc_vsx_stxvl:
-  case Intrinsic::ppc_vsx_stxvll: {
+  case Intrinsic::ppc_vsx_stxvll:
+  case Intrinsic::ppc_mma_stxvp: {
     Info.PtrVal = Inst->getArgOperand(1);
     Info.ReadMem = false;
     Info.WriteMem = true;
diff --git a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
@@ -0,0 +1,58 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-p:64:64-n32:64-v256:256:256-v512:512:512"
+
+declare <256 x i1> @llvm.ppc.mma.lxvp(i8*)
+declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*)
+define void @foo(i32 zeroext %n, <256 x i1>* %ptr, <256 x i1>* %ptr2) {
+; CHECK-LABEL: foo:
+; CHECK:       .LBB0_2: # %for.body
+; CHECK-NEXT:  #
+; CHECK:       lxvp
+; CHECK:       lxvp
+; CHECK:       lxvp
+; CHECK:       lxvp
+; CHECK:       stxvp
+; CHECK:       stxvp
+; CHECK:       stxvp
+; CHECK:       stxvp
+entry:
+  %cmp35.not = icmp eq i32 %n, 0
+  br i1 %cmp35.not, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:
+  %0 = bitcast <256 x i1>* %ptr to i8*
+  %1 = bitcast <256 x i1>* %ptr2 to i8*
+ 
%wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %2 = getelementptr i8, i8* %0, i64 %indvars.iv + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %add2 = add nuw nsw i64 %indvars.iv, 32 + %4 = getelementptr i8, i8* %0, i64 %add2 + %5 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %4) + %add4 = add nuw nsw i64 %indvars.iv, 64 + %6 = getelementptr i8, i8* %0, i64 %add4 + %7 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %6) + %add6 = add nuw nsw i64 %indvars.iv, 96 + %8 = getelementptr i8, i8* %0, i64 %add6 + %9 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %8) + %10 = getelementptr i8, i8* %1, i64 %indvars.iv + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %3, i8* %10) + %11 = getelementptr i8, i8* %1, i64 %add2 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %5, i8* %11) + %12 = getelementptr i8, i8* %1, i64 %add4 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %7, i8* %12) + %13 = getelementptr i8, i8* %1, i64 %add6 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %9, i8* %13) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -698,3 +698,307 @@ declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32) declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>) + +; Function Attrs: nounwind +define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 0(r3) +; CHECK-NEXT: stxvp vsp0, 0(r4) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 0(r3) +; CHECK-BE-NEXT: stxvp vsp0, 0(r4) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) + %2 = bitcast <256 x i1>* %vp2 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2) + ret void +} + +; Function Attrs: argmemonly nounwind readonly +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) + +; Function Attrs: argmemonly nounwind writeonly +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) + +; Function Attrs: nounwind +define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvpx vsp0, r3, r4 +; CHECK-NEXT: stxvpx vsp0, r5, r4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvpx vsp0, r3, r4 +; CHECK-BE-NEXT: stxvpx vsp0, r5, r4 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 %offset + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 %offset + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: plxvp vsp0, 18(r3), 0 +; CHECK-NEXT: pstxvp vsp0, 18(r4), 0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_3: +; CHECK-BE: # %bb.0: # %entry +; 
CHECK-BE-NEXT: plxvp vsp0, 18(r3), 0 +; CHECK-BE-NEXT: pstxvp vsp0, 18(r4), 0 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 18 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 18 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: plxvp vsp0, 1(r3), 0 +; CHECK-NEXT: pstxvp vsp0, 1(r4), 0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: plxvp vsp0, 1(r3), 0 +; CHECK-BE-NEXT: pstxvp vsp0, 1(r4), 0 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 1 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 1 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: plxvp vsp0, 42(r3), 0 +; CHECK-NEXT: pstxvp vsp0, 42(r4), 0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: plxvp vsp0, 42(r3), 0 +; CHECK-BE-NEXT: pstxvp vsp0, 42(r4), 0 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 42 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 42 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 4096(r3) +; CHECK-NEXT: stxvp vsp0, 4096(r4) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_6: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 4096(r3) +; CHECK-BE-NEXT: stxvp vsp0, 4096(r4) +; CHECK-BE-NEXT: blr +entry: + %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128 + %1 = bitcast <256 x i1>* %0 to i8* + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128 + %4 = bitcast <256 x i1>* %3 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; FIXME: A prefixed load (plxvp) is expected here as the offset in this +; test case is a constant that fits within 34-bits. 
+; CHECK-LABEL: test_ldst_7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: ori r5, r5, 32799 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_7: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 0 +; CHECK-BE-NEXT: ori r5, r5, 32799 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 32799 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 32799 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_8(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: plxvp vsp4, 8(r4), 0 +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_8: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: plxvp vsp4, 8(r4), 0 +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = getelementptr i8, i8* %2, i64 8 + %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) + %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0) + %6 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %5, <512 x i1>* %6, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_9(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r4) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_9: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r4) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = 
tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_10(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r5) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r9) +; CHECK-NEXT: stxv vs1, 32(r9) +; CHECK-NEXT: stxv vs2, 16(r9) +; CHECK-NEXT: stxv vs3, 0(r9) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_10: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r5) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r9) +; CHECK-BE-NEXT: stxv vs0, 0(r9) +; CHECK-BE-NEXT: stxv vs3, 48(r9) +; CHECK-BE-NEXT: stxv vs2, 32(r9) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +}
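
Illustration only, not part of the patch: a minimal C sketch of how the two builtins declared in BuiltinsPPC.def above could be used from source, assuming a Power10 target (e.g. clang -mcpu=pwr10) with MMA and paired vector memory operations enabled. The wrapper function name copy_pair is hypothetical; the builtin names and operand order follow the "W256SLLiW256C*" and "vW256SLLiW256C*" prototypes added in this patch.

// Hypothetical usage sketch of the new paired vector load/store builtins.
// Per the CGBuiltin.cpp lowering above, __builtin_mma_lxvp(offset, ptr) loads
// a 256-bit __vector_pair from ptr + offset (a signed long long byte offset),
// and __builtin_mma_stxvp(pair, offset, ptr) stores one back. The backend
// patterns select lxvp/plxvp/lxvpx (and stxvp/pstxvp/stxvpx) based on the
// addressing form.
void copy_pair(const __vector_pair *src, __vector_pair *dst) {
  __vector_pair vp = __builtin_mma_lxvp(0LL, src);
  __builtin_mma_stxvp(vp, 0LL, dst);
}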