diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -738,6 +738,8 @@ MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true) MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true) MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true) +MMA_BUILTIN(lxvp, "W256SLLiW256C*", false) +MMA_BUILTIN(stxvp, "vW256SLLiW256C*", false) // FIXME: Obviously incomplete. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14776,6 +14776,19 @@ break; #include "clang/Basic/BuiltinsPPC.def" } + if (BuiltinID == PPC::BI__builtin_mma_lxvp || + BuiltinID == PPC::BI__builtin_mma_stxvp) { + if (BuiltinID == PPC::BI__builtin_mma_lxvp) { + Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy); + Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]); + } else { + Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy); + Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]); + } + Ops.pop_back(); + llvm::Function *F = CGM.getIntrinsic(ID); + return Builder.CreateCall(F, Ops, ""); + } SmallVector<Value *, 4> CallOps; if (Accumulate) { Address Addr = EmitPointerWithAlignment(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-ppc-mma.c b/clang/test/CodeGen/builtins-ppc-mma.c --- a/clang/test/CodeGen/builtins-ppc-mma.c +++ b/clang/test/CodeGen/builtins-ppc-mma.c @@ -1036,3 +1036,162 @@ __builtin_mma_pmxvbf16ger2nn(&vq, vc, vc, 0, 0, 0); *((__vector_quad *)resp) = vq; } + +// CHECK-LABEL: @test66( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP1]], i8* [[TMP2]]) +// CHECK-NEXT: ret void +// +void test66(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); + __builtin_mma_stxvp(vp, 0LL, vp2); +} + +// CHECK-LABEL: @test67( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[OFFSET:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[OFFSET]] +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test67(const __vector_pair *vpp, signed long long offset, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(offset, vpp); + __builtin_mma_stxvp(vp, offset, vp2); +} + +// CHECK-LABEL: @test68( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 18 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 18 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test68(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(18LL, vpp); + __builtin_mma_stxvp(vp, 18LL, vp2); +} + 
+// CHECK-LABEL: @test69( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 1 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test69(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(1LL, vpp); + __builtin_mma_stxvp(vp, 1LL, vp2); +} + +// CHECK-LABEL: @test70( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 42 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 42 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test70(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(42LL, vpp); + __builtin_mma_stxvp(vp, 42LL, vp2); +} + +// CHECK-LABEL: @test71( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VPP:%.*]], i64 128 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i1>* [[TMP0]] to i8* +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VP2:%.*]], i64 128 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i1>* [[TMP3]] to i8* +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test71(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(32768LL, vpp); + __builtin_mma_stxvp(vp, 32768LL, vp2); +} + +// CHECK-LABEL: @test72( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 32799 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 32799 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test72(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(32799LL, vpp); + __builtin_mma_stxvp(vp, 32799LL, vp2); +} + +// CHECK-LABEL: @test73( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 8 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 
64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(8LL, vpp); + __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test74( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test75( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[OFFS:%.*]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test75(unsigned char *vqp, signed long long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(offs, vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} diff --git a/clang/test/Sema/ppc-mma-types.c b/clang/test/Sema/ppc-mma-types.c --- a/clang/test/Sema/ppc-mma-types.c +++ b/clang/test/Sema/ppc-mma-types.c @@ -319,3 +319,17 @@ __vector_pair vp2 = (__vector_pair)vpp; // expected-error {{used type '__vector_pair' where arithmetic or pointer type is required}} } +void testBuiltinTypes1(const __vector_pair *vpp, const __vector_pair *vp2, float f) { + __vector_pair vp = __builtin_mma_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}} + __builtin_mma_stxvp(vp, 32799, vp2); // expected-error {{passing 'int' to parameter of incompatible type 'long long'}} +} + +void testBuiltinTypes2(__vector_pair *vpp, const __vector_pair *vp2, unsigned char c) { + __vector_pair vp = __builtin_mma_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}} + __builtin_mma_stxvp(vp, c, vp2); // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}} +} + +void testBuiltinTypes3(vector int v, __vector_pair *vp2, signed long long ll, unsigned short s) { + __vector_pair vp = 
__builtin_mma_lxvp(ll, v); // expected-error {{passing '__vector int' (vector of 4 'int' values) to parameter of incompatible type 'const __vector_pair *'}} + __builtin_mma_stxvp(vp, ll, s); // expected-error {{passing 'unsigned short' to parameter of incompatible type 'const __vector_pair *'}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1422,6 +1422,14 @@ def int_ppc_mma_xxsetaccz : Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>; + def int_ppc_mma_lxvp : + Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_ppc_mma_stxvp : + Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly]>; + // MMA Reduced-Precision: Outer Product Intrinsic Definitions. defm int_ppc_mma_xvi4ger8 : PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -293,6 +293,13 @@ Align(16)); } + /// SelectAddrImmX34 - Returns true if the address N can be represented by + /// a base register plus a signed 34-bit displacement. Suitable for use by + /// PSTXVP and friends. + bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG); + } + // Select an address into a single register. bool SelectAddr(SDValue N, SDValue &Base) { Base = N; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -770,6 +770,8 @@ bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, MaybeAlign EncodingAlignment) const; + bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base, + SelectionDAG &DAG) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. @@ -1325,6 +1327,8 @@ bool isIntS16Immediate(SDNode *N, int16_t &Imm); bool isIntS16Immediate(SDValue Op, int16_t &Imm); + bool isIntS34Immediate(SDNode *N, int64_t &Imm); + bool isIntS34Immediate(SDValue Op, int64_t &Imm); bool convertToNonDenormSingle(APInt &ArgAPInt); bool convertToNonDenormSingle(APFloat &ArgAPFloat); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2399,6 +2399,20 @@ return false; } +/// isIntS34Immediate - This method tests if the value of the given node can +/// be accurately represented as a sign extension from a 34-bit value. If so, +/// this returns true and the immediate. +bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) { + if (!isa<ConstantSDNode>(N)) + return false; + + Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); + return isInt<34>(Imm); +} +bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) { + return isIntS34Immediate(Op.getNode(), Imm); +} + /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented as [r+imm]. 
If \p EncodingAlignment is @@ -2599,6 +2613,55 @@ return true; // [r+0] } +/// Similar to the 16-bit case but for instructions that take a 34-bit +/// displacement field (prefixed loads/stores). +bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp, + SDValue &Base, + SelectionDAG &DAG) const { + // Only on 64-bit targets. + if (N.getValueType() != MVT::i64) + return false; + + SDLoc dl(N); + int64_t Imm = 0; + + if (N.getOpcode() == ISD::ADD) { + if (!isIntS34Immediate(N.getOperand(1), Imm)) + return false; + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + else + Base = N.getOperand(0); + return true; + } + + if (N.getOpcode() == ISD::OR) { + if (!isIntS34Immediate(N.getOperand(1), Imm)) + return false; + // If this is an or of disjoint bitfields, we can codegen this as an add + // (for better address arithmetic) if the LHS and RHS of the OR are + // provably disjoint. + KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0)); + if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL) + return false; + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + else + Base = N.getOperand(0); + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + return true; + } + + if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const. + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + Base = DAG.getRegister(PPC::ZERO8, N.getValueType()); + return true; + } + + return false; +} + /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1031,11 +1031,13 @@ // Define PowerPC specific addressing mode. // d-form -def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb" +def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb" // ds-form -def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" // dq-form -def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" +def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" +// 8LS:d-form +def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34", [], []>; // "pstxvp" // Below forms are all x-form addressing mode, use three different ones so we // can make a accurate check for x-form instructions in ISEL. diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -1654,6 +1654,24 @@ "pstxvp $XTp, $D_RA", IIC_LdStLFD>; } +let Predicates = [PairedVectorMemops] in { + // Intrinsics for Paired Vector Loads. + def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>; + def : Pat<(v256i1 (int_ppc_mma_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>; + let Predicates = [PairedVectorMemops, PrefixInstrs] in { + def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>; + } + // Intrinsics for Paired Vector Stores. 
+ def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst), + (STXVP $XSp, memrix16:$dst)>; + def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddrX16:$dst), + (STXVPX $XSp, xaddrX16:$dst)>; + let Predicates = [PairedVectorMemops, PrefixInstrs] in { + def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst), + (PSTXVP $XSp, memri34:$dst)>; + } +} + // TODO: We have an added complexity of 500 here. This is only a temporary // solution to have tablegen consider these patterns first. The way we do // addressing for PowerPC is complex depending on available D form, X form, or diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -60,6 +60,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -277,8 +278,11 @@ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) { return SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) + if (IMemI->getIntrinsicID() == Intrinsic::prefetch || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) return IMemI->getArgOperand(0); + if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) + return IMemI->getArgOperand(1); } return nullptr; @@ -345,9 +349,13 @@ MemI = SMemI; PtrValue = SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) { MemI = IMemI; PtrValue = IMemI->getArgOperand(0); + } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) { + MemI = IMemI; + PtrValue = IMemI->getArgOperand(1); } else continue; } else continue; @@ -827,6 +835,11 @@ if (ST && ST->hasAltivec() && PtrValue->getType()->getPointerElementType()->isVectorTy()) return false; + // There are no update forms for the P10 lxvp/stxvp intrinsics. + auto *II = dyn_cast<IntrinsicInst>(I); + if (II && ((II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) || + II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp)) + return false; // See getPreIndexedAddressParts, the displacement for LDU/STDU has to // be 4's multiple (DS-form). For i64 loads/stores when the displacement // fits in a 16-bit signed field but isn't a multiple of 4, it will be @@ -864,7 +877,13 @@ // Check if a load/store has DQ form. auto isDQFormCandidate = [&] (const Instruction *I, const Value *PtrValue) { assert((PtrValue && I) && "Invalid parameter!"); - return !isa<IntrinsicInst>(I) && ST && ST->hasP9Vector() && + // Check if it is a P10 lxvp/stxvp intrinsic. + auto *II = dyn_cast<IntrinsicInst>(I); + if (II) + return II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp || + II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp; + // Check if it is a P9 vector load/store. 
+ return ST && ST->hasP9Vector() && (PtrValue->getType()->getPointerElementType()->isVectorTy()); }; diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1223,7 +1223,8 @@ case Intrinsic::ppc_vsx_lxvd2x_be: case Intrinsic::ppc_vsx_lxvw4x_be: case Intrinsic::ppc_vsx_lxvl: - case Intrinsic::ppc_vsx_lxvll: { + case Intrinsic::ppc_vsx_lxvll: + case Intrinsic::ppc_mma_lxvp: { Info.PtrVal = Inst->getArgOperand(0); Info.ReadMem = true; Info.WriteMem = false; @@ -1239,7 +1240,8 @@ case Intrinsic::ppc_vsx_stxvd2x_be: case Intrinsic::ppc_vsx_stxvw4x_be: case Intrinsic::ppc_vsx_stxvl: - case Intrinsic::ppc_vsx_stxvll: { + case Intrinsic::ppc_vsx_stxvll: + case Intrinsic::ppc_mma_stxvp: { Info.PtrVal = Inst->getArgOperand(1); Info.ReadMem = false; Info.WriteMem = true; diff --git a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: --check-prefix=CHECK-BE + +; This test checks that LSR properly recognizes lxvp/stxvp as load/store +; intrinsics to avoid generating x-form instructions instead of d-forms. + +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) +define void @foo(i32 zeroext %n, <256 x i1>* %ptr, <256 x i1>* %ptr2) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmplwi r3, 0 +; CHECK-NEXT: beqlr cr0 +; CHECK-NEXT: # %bb.1: # %for.body.lr.ph +; CHECK-NEXT: clrldi r6, r3, 32 +; CHECK-NEXT: addi r3, r4, 64 +; CHECK-NEXT: addi r4, r5, 64 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lxvp vsp0, -64(r3) +; CHECK-NEXT: lxvp vsp2, -32(r3) +; CHECK-NEXT: lxvp vsp4, 0(r3) +; CHECK-NEXT: lxvp vsp6, 32(r3) +; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: stxvp vsp0, -64(r4) +; CHECK-NEXT: stxvp vsp2, -32(r4) +; CHECK-NEXT: stxvp vsp4, 0(r4) +; CHECK-NEXT: stxvp vsp6, 32(r4) +; CHECK-NEXT: addi r4, r4, 1 +; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: foo: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmplwi r3, 0 +; CHECK-BE-NEXT: beqlr cr0 +; CHECK-BE-NEXT: # %bb.1: # %for.body.lr.ph +; CHECK-BE-NEXT: clrldi r6, r3, 32 +; CHECK-BE-NEXT: addi r3, r4, 64 +; CHECK-BE-NEXT: addi r4, r5, 64 +; CHECK-BE-NEXT: mtctr r6 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB0_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxvp vsp0, -64(r3) +; CHECK-BE-NEXT: lxvp vsp2, -32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r3) +; CHECK-BE-NEXT: lxvp vsp6, 32(r3) +; CHECK-BE-NEXT: addi r3, r3, 1 +; CHECK-BE-NEXT: stxvp vsp0, -64(r4) +; CHECK-BE-NEXT: stxvp vsp2, -32(r4) +; CHECK-BE-NEXT: stxvp vsp4, 0(r4) +; CHECK-BE-NEXT: stxvp vsp6, 32(r4) +; CHECK-BE-NEXT: addi r4, r4, 1 +; CHECK-BE-NEXT: bdnz .LBB0_2 +; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-NEXT: blr +entry: + %cmp35.not = icmp eq i32 %n, 0 + br i1 %cmp35.not, 
label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: + %0 = bitcast <256 x i1>* %ptr to i8* + %1 = bitcast <256 x i1>* %ptr2 to i8* + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %2 = getelementptr i8, i8* %0, i64 %indvars.iv + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %add2 = add nuw nsw i64 %indvars.iv, 32 + %4 = getelementptr i8, i8* %0, i64 %add2 + %5 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %4) + %add4 = add nuw nsw i64 %indvars.iv, 64 + %6 = getelementptr i8, i8* %0, i64 %add4 + %7 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %6) + %add6 = add nuw nsw i64 %indvars.iv, 96 + %8 = getelementptr i8, i8* %0, i64 %add6 + %9 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %8) + %10 = getelementptr i8, i8* %1, i64 %indvars.iv + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %3, i8* %10) + %11 = getelementptr i8, i8* %1, i64 %add2 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %5, i8* %11) + %12 = getelementptr i8, i8* %1, i64 %add4 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %7, i8* %12) + %13 = getelementptr i8, i8* %1, i64 %add6 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %9, i8* %13) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + diff --git a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s \ +; RUN: --check-prefix=CHECK-BE + +; This test checks the PPCLoopInstrFormPrep pass supports the lxvp and stxvp +; intrinsics so we generate more dq-form instructions instead of x-forms. 
+ +%_elem_type_of_x = type <{ double }> +%_elem_type_of_y = type <{ double }> + +define void @foo(i64* %.n, [0 x %_elem_type_of_x]* %.x, [0 x %_elem_type_of_y]* %.y, <2 x double>* %.sum) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld r5, 0(r3) +; CHECK-NEXT: cmpdi r5, 1 +; CHECK-NEXT: bltlr cr0 +; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-NEXT: addi r3, r4, 1 +; CHECK-NEXT: addi r4, r5, -1 +; CHECK-NEXT: lxv vs0, 0(r6) +; CHECK-NEXT: rldicl r4, r4, 60, 4 +; CHECK-NEXT: addi r4, r4, 1 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_2: # %_loop_1_do_ +; CHECK-NEXT: # +; CHECK-NEXT: lxvp vsp2, 0(r3) +; CHECK-NEXT: lxvp vsp4, 32(r3) +; CHECK-NEXT: addi r3, r3, 128 +; CHECK-NEXT: xvadddp vs0, vs0, vs3 +; CHECK-NEXT: xvadddp vs0, vs0, vs2 +; CHECK-NEXT: xvadddp vs0, vs0, vs5 +; CHECK-NEXT: xvadddp vs0, vs0, vs4 +; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge +; CHECK-NEXT: stxv vs0, 0(r6) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: foo: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: ld r5, 0(r3) +; CHECK-BE-NEXT: cmpdi r5, 1 +; CHECK-BE-NEXT: bltlr cr0 +; CHECK-BE-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-BE-NEXT: addi r3, r4, 1 +; CHECK-BE-NEXT: addi r4, r5, -1 +; CHECK-BE-NEXT: lxv vs0, 0(r6) +; CHECK-BE-NEXT: rldicl r4, r4, 60, 4 +; CHECK-BE-NEXT: addi r4, r4, 1 +; CHECK-BE-NEXT: mtctr r4 +; CHECK-BE-NEXT: .p2align 5 +; CHECK-BE-NEXT: .LBB0_2: # %_loop_1_do_ +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxvp vsp2, 0(r3) +; CHECK-BE-NEXT: lxvp vsp4, 32(r3) +; CHECK-BE-NEXT: addi r3, r3, 128 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs2 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs3 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs4 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs5 +; CHECK-BE-NEXT: bdnz .LBB0_2 +; CHECK-BE-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge +; CHECK-BE-NEXT: stxv vs0, 0(r6) +; CHECK-BE-NEXT: blr +entry: + %_val_n_2 = load i64, i64* %.n, align 8 + %_grt_tmp7 = icmp slt i64 %_val_n_2, 1 + br i1 %_grt_tmp7, label %_return_bb, label %_loop_1_do_.lr.ph + +_loop_1_do_.lr.ph: ; preds = %entry + %x_rvo_based_addr_5 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %.sum.promoted = load <2 x double>, <2 x double>* %.sum, align 16 + br label %_loop_1_do_ + +_loop_1_do_: ; preds = %_loop_1_do_.lr.ph, %_loop_1_do_ + %_val_sum_9 = phi <2 x double> [ %.sum.promoted, %_loop_1_do_.lr.ph ], [ %_add_tmp49, %_loop_1_do_ ] + %i.08 = phi i64 [ 1, %_loop_1_do_.lr.ph ], [ %_loop_1_update_loop_ix, %_loop_1_do_ ] + %x_ix_dim_0_6 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_5, i64 %i.08 + %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_6 to i8* + %0 = getelementptr i8, i8* %x_ix_dim_0_, i64 1 + %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) + %2 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %1) + %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %2, 0 + %.fca.1.extract2 = extractvalue { <16 x i8>, <16 x i8> } %2, 1 + %3 = getelementptr i8, i8* %x_ix_dim_0_, i64 33 + %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) + %5 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %4) + %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 0 + %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 1 + %6 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double> + %_add_tmp23 = fadd contract <2 x double> %_val_sum_9, %6 + %7 = bitcast <16 x i8> %.fca.1.extract2 to <2 x 
double> + %_add_tmp32 = fadd contract <2 x double> %_add_tmp23, %7 + %8 = bitcast <16 x i8> %.fca.0.extract to <2 x double> + %_add_tmp40 = fadd contract <2 x double> %_add_tmp32, %8 + %9 = bitcast <16 x i8> %.fca.1.extract to <2 x double> + %_add_tmp49 = fadd contract <2 x double> %_add_tmp40, %9 + %_loop_1_update_loop_ix = add nuw nsw i64 %i.08, 16 + %_grt_tmp = icmp sgt i64 %_loop_1_update_loop_ix, %_val_n_2 + br i1 %_grt_tmp, label %_loop_1_loopHeader_._return_bb_crit_edge, label %_loop_1_do_ + +_loop_1_loopHeader_._return_bb_crit_edge: ; preds = %_loop_1_do_ + store <2 x double> %_add_tmp49, <2 x double>* %.sum, align 16 + br label %_return_bb + +_return_bb: ; preds = %_loop_1_loopHeader_._return_bb_crit_edge, %entry + ret void +} + +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -698,3 +698,315 @@ declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32) declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>) + +; Function Attrs: nounwind +define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 0(r3) +; CHECK-NEXT: stxvp vsp0, 0(r4) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 0(r3) +; CHECK-BE-NEXT: stxvp vsp0, 0(r4) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) + %2 = bitcast <256 x i1>* %vp2 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2) + ret void +} + +; Function Attrs: argmemonly nounwind readonly +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) + +; Function Attrs: argmemonly nounwind writeonly +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) + +; Function Attrs: nounwind +define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvpx vsp0, r3, r4 +; CHECK-NEXT: stxvpx vsp0, r5, r4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvpx vsp0, r3, r4 +; CHECK-BE-NEXT: stxvpx vsp0, r5, r4 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 %offset + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 %offset + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 18 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 18 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 18 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 18 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function 
Attrs: nounwind +define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 1 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 1 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 1 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 42 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 42 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 42 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 42 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 4096(r3) +; CHECK-NEXT: stxvp vsp0, 4096(r4) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_6: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 4096(r3) +; CHECK-BE-NEXT: stxvp vsp0, 4096(r4) +; CHECK-BE-NEXT: blr +entry: + %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128 + %1 = bitcast <256 x i1>* %0 to i8* + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128 + %4 = bitcast <256 x i1>* %3 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; FIXME: A prefixed load (plxvp) is expected here as the offset in this +; test case is a constant that fits within 34-bits. 
+; CHECK-LABEL: test_ldst_7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: ori r5, r5, 32799 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_7: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 0 +; CHECK-BE-NEXT: ori r5, r5, 32799 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 32799 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 32799 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_8(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: lxvpx vsp4, r4, r3 +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_8: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: li r3, 8 +; CHECK-BE-NEXT: lxvpx vsp4, r4, r3 +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = getelementptr i8, i8* %2, i64 8 + %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) + %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0) + %6 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %5, <512 x i1>* %6, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_9(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r4) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_9: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r4) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + 
%2 = bitcast <256 x i1>* %vpp to i8* + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_10(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r5) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r9) +; CHECK-NEXT: stxv vs1, 32(r9) +; CHECK-NEXT: stxv vs2, 16(r9) +; CHECK-NEXT: stxv vs3, 0(r9) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_10: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r5) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r9) +; CHECK-BE-NEXT: stxv vs0, 0(r9) +; CHECK-BE-NEXT: stxv vs3, 48(r9) +; CHECK-BE-NEXT: stxv vs2, 32(r9) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +}
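
Usage sketch (not part of the patch): the clang tests above already exercise the new builtins; the snippet below is a minimal, self-contained illustration of the C-level API they add. The file name, function name, and the build flag are assumptions for illustration only; the builtin signatures mirror the prototype string in BuiltinsPPC.def and the tests in builtins-ppc-mma.c.

// pair_copy.c -- illustrative sketch only.
// Assumed build line: clang -mcpu=pwr10 -O2 -S pair_copy.c
// __builtin_mma_lxvp(offset, base) loads a 32-byte vector pair from
// base + offset (offset is a signed long long byte offset);
// __builtin_mma_stxvp(pair, offset, base) stores the pair to base + offset.
// Both take a const-qualified __vector_pair pointer, per the def string
// "W256SLLiW256C*" added above.
void pair_copy(const __vector_pair *src, const __vector_pair *dst,
               long long byte_offset) {
  __vector_pair vp = __builtin_mma_lxvp(byte_offset, src);
  __builtin_mma_stxvp(vp, byte_offset, dst);
}

As the llc tests show, a constant offset that is a multiple of 16 and fits the DQ-form displacement selects the lxvp/stxvp D-forms (test_ldst_1, test_ldst_6), while variable or unaligned offsets fall back to the indexed lxvpx/stxvpx forms (test_ldst_2 through test_ldst_5, with the prefixed plxvp case noted as a FIXME in test_ldst_7).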