diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -7,8 +7,9 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines the PowerPC-specific builtin function database. Users of
-// this file must define the BUILTIN macro or the MMA_BUILTIN macro to make use
-// of this information.
+// this file must define the BUILTIN macro or the CUSTOM_BUILTIN macro to
+// make use of this information. The latter is used for builtins requiring
+// custom code generation and checking.
 //
 //===----------------------------------------------------------------------===//
@@ -18,9 +19,9 @@
 // The format of this database matches clang/Basic/Builtins.def except for the
 // MMA builtins, which use their own format, documented below.
-#if defined(BUILTIN) && !defined(MMA_BUILTIN)
-# define MMA_BUILTIN(ID, TYPES, ACCUMULATE) BUILTIN(__builtin_mma_##ID, "i.", "t")
-#elif defined(MMA_BUILTIN) && !defined(BUILTIN)
+#if defined(BUILTIN) && !defined(CUSTOM_BUILTIN)
+# define CUSTOM_BUILTIN(ID, TYPES, ACCUMULATE) BUILTIN(__builtin_##ID, "i.", "t")
+#elif defined(CUSTOM_BUILTIN) && !defined(BUILTIN)
 # define BUILTIN(ID, TYPES, ATTRS)
 #endif
@@ -659,94 +660,94 @@
 // Cache built-ins
 BUILTIN(__builtin_dcbf, "vvC*", "")
-// MMA built-ins
-// All MMA built-ins are declared here using the MMA_BUILTIN macro. Because
-// these built-ins rely on target-dependent types and to avoid pervasive change,
-// they are type checked manually in Sema using custom type descriptors.
-// The first argument of the MMA_BUILTIN macro is the name of the built-in, the
-// second argument specifies the type of the function (result value, then each
-// argument) as follows:
+// Built-ins requiring custom code generation.
+// Because these built-ins rely on target-dependent types and to avoid pervasive
+// change, they are type checked manually in Sema using custom type descriptors.
+// The first argument of the CUSTOM_BUILTIN macro is the name of the built-in
+// with its prefix; the second argument specifies the type of the function
+// (result value, then each argument) as follows:
 // i -> Unsigned integer followed by the greatest possible value for that
 //      argument or 0 if no constraint on the value.
 //      (e.g. i15 for a 4-bit value)
-// v -> void
 // V -> Vector type used with MMA builtins (vector unsigned char)
-// W -> MMA vector type followed by the size of the vector type.
+// W -> PPC vector type followed by the size of the vector type.
 //      (e.g. W512 for __vector_quad)
+// any other descriptor -> Fall back to generic type descriptor decoding.
 // The 'C' suffix can be used to specify a const type.
 // The '*' suffix can be used to specify a pointer to a type.
 // The third argument is set to true if the builtin accumulates its result into
 // its given accumulator.
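For illustration, a descriptor string maps onto a C-level prototype as follows. This is only a minimal sketch: it assumes a Power10 target with MMA enabled (e.g. -mcpu=pwr10, so that __vector_quad, __vector_pair, and the vector keyword are available) and mirrors the tests updated later in this patch; the function name is made up.

  // Sketch only: "vW512*VVi15i15" (pmxvf32ger) decodes to a void result, a
  // __vector_quad * accumulator (W512*), two vector unsigned char operands (V),
  // and two constant unsigned operands each limited to 15 (i15).
  void sketch_pmxvf32ger(__vector_quad *acc, vector unsigned char va,
                         vector unsigned char vb) {
    __builtin_mma_pmxvf32ger(acc, va, vb, 0, 0);
  }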
-MMA_BUILTIN(assemble_acc, "vW512*VVVV", false) -MMA_BUILTIN(disassemble_acc, "vv*W512*", false) -MMA_BUILTIN(assemble_pair, "vW256*VV", false) -MMA_BUILTIN(disassemble_pair, "vv*W256*", false) -MMA_BUILTIN(xxmtacc, "vW512*", true) -MMA_BUILTIN(xxmfacc, "vW512*", true) -MMA_BUILTIN(xxsetaccz, "vW512*", false) -MMA_BUILTIN(xvi4ger8, "vW512*VV", false) -MMA_BUILTIN(xvi8ger4, "vW512*VV", false) -MMA_BUILTIN(xvi16ger2, "vW512*VV", false) -MMA_BUILTIN(xvi16ger2s, "vW512*VV", false) -MMA_BUILTIN(xvf16ger2, "vW512*VV", false) -MMA_BUILTIN(xvf32ger, "vW512*VV", false) -MMA_BUILTIN(xvf64ger, "vW512*W256V", false) -MMA_BUILTIN(pmxvi4ger8, "vW512*VVi15i15i255", false) -MMA_BUILTIN(pmxvi8ger4, "vW512*VVi15i15i15", false) -MMA_BUILTIN(pmxvi16ger2, "vW512*VVi15i15i3", false) -MMA_BUILTIN(pmxvi16ger2s, "vW512*VVi15i15i3", false) -MMA_BUILTIN(pmxvf16ger2, "vW512*VVi15i15i3", false) -MMA_BUILTIN(pmxvf32ger, "vW512*VVi15i15", false) -MMA_BUILTIN(pmxvf64ger, "vW512*W256Vi15i3", false) -MMA_BUILTIN(xvi4ger8pp, "vW512*VV", true) -MMA_BUILTIN(xvi8ger4pp, "vW512*VV", true) -MMA_BUILTIN(xvi8ger4spp, "vW512*VV", true) -MMA_BUILTIN(xvi16ger2pp, "vW512*VV", true) -MMA_BUILTIN(xvi16ger2spp, "vW512*VV", true) -MMA_BUILTIN(pmxvi4ger8pp, "vW512*VVi15i15i255", true) -MMA_BUILTIN(pmxvi8ger4pp, "vW512*VVi15i15i15", true) -MMA_BUILTIN(pmxvi8ger4spp, "vW512*VVi15i15i15", true) -MMA_BUILTIN(pmxvi16ger2pp, "vW512*VVi15i15i3", true) -MMA_BUILTIN(pmxvi16ger2spp, "vW512*VVi15i15i3", true) -MMA_BUILTIN(xvf16ger2pp, "vW512*VV", true) -MMA_BUILTIN(xvf16ger2pn, "vW512*VV", true) -MMA_BUILTIN(xvf16ger2np, "vW512*VV", true) -MMA_BUILTIN(xvf16ger2nn, "vW512*VV", true) -MMA_BUILTIN(pmxvf16ger2pp, "vW512*VVi15i15i3", true) -MMA_BUILTIN(pmxvf16ger2pn, "vW512*VVi15i15i3", true) -MMA_BUILTIN(pmxvf16ger2np, "vW512*VVi15i15i3", true) -MMA_BUILTIN(pmxvf16ger2nn, "vW512*VVi15i15i3", true) -MMA_BUILTIN(xvf32gerpp, "vW512*VV", true) -MMA_BUILTIN(xvf32gerpn, "vW512*VV", true) -MMA_BUILTIN(xvf32gernp, "vW512*VV", true) -MMA_BUILTIN(xvf32gernn, "vW512*VV", true) -MMA_BUILTIN(pmxvf32gerpp, "vW512*VVi15i15", true) -MMA_BUILTIN(pmxvf32gerpn, "vW512*VVi15i15", true) -MMA_BUILTIN(pmxvf32gernp, "vW512*VVi15i15", true) -MMA_BUILTIN(pmxvf32gernn, "vW512*VVi15i15", true) -MMA_BUILTIN(xvf64gerpp, "vW512*W256V", true) -MMA_BUILTIN(xvf64gerpn, "vW512*W256V", true) -MMA_BUILTIN(xvf64gernp, "vW512*W256V", true) -MMA_BUILTIN(xvf64gernn, "vW512*W256V", true) -MMA_BUILTIN(pmxvf64gerpp, "vW512*W256Vi15i3", true) -MMA_BUILTIN(pmxvf64gerpn, "vW512*W256Vi15i3", true) -MMA_BUILTIN(pmxvf64gernp, "vW512*W256Vi15i3", true) -MMA_BUILTIN(pmxvf64gernn, "vW512*W256Vi15i3", true) -MMA_BUILTIN(xvbf16ger2, "vW512*VV", false) -MMA_BUILTIN(pmxvbf16ger2, "vW512*VVi15i15i3", false) -MMA_BUILTIN(xvbf16ger2pp, "vW512*VV", true) -MMA_BUILTIN(xvbf16ger2pn, "vW512*VV", true) -MMA_BUILTIN(xvbf16ger2np, "vW512*VV", true) -MMA_BUILTIN(xvbf16ger2nn, "vW512*VV", true) -MMA_BUILTIN(pmxvbf16ger2pp, "vW512*VVi15i15i3", true) -MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true) -MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true) -MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true) -MMA_BUILTIN(lxvp, "W256SLLiW256C*", false) -MMA_BUILTIN(stxvp, "vW256SLLiW256C*", false) +CUSTOM_BUILTIN(vsx_lxvp, "W256SLLiW256C*", false) +CUSTOM_BUILTIN(vsx_stxvp, "vW256SLLiW256C*", false) +CUSTOM_BUILTIN(vsx_assemble_pair, "vW256*VV", false) +CUSTOM_BUILTIN(vsx_disassemble_pair, "vv*W256*", false) + +CUSTOM_BUILTIN(mma_assemble_acc, "vW512*VVVV", false) +CUSTOM_BUILTIN(mma_disassemble_acc, "vv*W512*", false) 
+CUSTOM_BUILTIN(mma_xxmtacc, "vW512*", true) +CUSTOM_BUILTIN(mma_xxmfacc, "vW512*", true) +CUSTOM_BUILTIN(mma_xxsetaccz, "vW512*", false) +CUSTOM_BUILTIN(mma_xvi4ger8, "vW512*VV", false) +CUSTOM_BUILTIN(mma_xvi8ger4, "vW512*VV", false) +CUSTOM_BUILTIN(mma_xvi16ger2, "vW512*VV", false) +CUSTOM_BUILTIN(mma_xvi16ger2s, "vW512*VV", false) +CUSTOM_BUILTIN(mma_xvf16ger2, "vW512*VV", false) +CUSTOM_BUILTIN(mma_xvf32ger, "vW512*VV", false) +CUSTOM_BUILTIN(mma_xvf64ger, "vW512*W256V", false) +CUSTOM_BUILTIN(mma_pmxvi4ger8, "vW512*VVi15i15i255", false) +CUSTOM_BUILTIN(mma_pmxvi8ger4, "vW512*VVi15i15i15", false) +CUSTOM_BUILTIN(mma_pmxvi16ger2, "vW512*VVi15i15i3", false) +CUSTOM_BUILTIN(mma_pmxvi16ger2s, "vW512*VVi15i15i3", false) +CUSTOM_BUILTIN(mma_pmxvf16ger2, "vW512*VVi15i15i3", false) +CUSTOM_BUILTIN(mma_pmxvf32ger, "vW512*VVi15i15", false) +CUSTOM_BUILTIN(mma_pmxvf64ger, "vW512*W256Vi15i3", false) +CUSTOM_BUILTIN(mma_xvi4ger8pp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvi8ger4pp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvi8ger4spp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvi16ger2pp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvi16ger2spp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_pmxvi4ger8pp, "vW512*VVi15i15i255", true) +CUSTOM_BUILTIN(mma_pmxvi8ger4pp, "vW512*VVi15i15i15", true) +CUSTOM_BUILTIN(mma_pmxvi8ger4spp, "vW512*VVi15i15i15", true) +CUSTOM_BUILTIN(mma_pmxvi16ger2pp, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_pmxvi16ger2spp, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_xvf16ger2pp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvf16ger2pn, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvf16ger2np, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvf16ger2nn, "vW512*VV", true) +CUSTOM_BUILTIN(mma_pmxvf16ger2pp, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_pmxvf16ger2pn, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_pmxvf16ger2np, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_pmxvf16ger2nn, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_xvf32gerpp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvf32gerpn, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvf32gernp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvf32gernn, "vW512*VV", true) +CUSTOM_BUILTIN(mma_pmxvf32gerpp, "vW512*VVi15i15", true) +CUSTOM_BUILTIN(mma_pmxvf32gerpn, "vW512*VVi15i15", true) +CUSTOM_BUILTIN(mma_pmxvf32gernp, "vW512*VVi15i15", true) +CUSTOM_BUILTIN(mma_pmxvf32gernn, "vW512*VVi15i15", true) +CUSTOM_BUILTIN(mma_xvf64gerpp, "vW512*W256V", true) +CUSTOM_BUILTIN(mma_xvf64gerpn, "vW512*W256V", true) +CUSTOM_BUILTIN(mma_xvf64gernp, "vW512*W256V", true) +CUSTOM_BUILTIN(mma_xvf64gernn, "vW512*W256V", true) +CUSTOM_BUILTIN(mma_pmxvf64gerpp, "vW512*W256Vi15i3", true) +CUSTOM_BUILTIN(mma_pmxvf64gerpn, "vW512*W256Vi15i3", true) +CUSTOM_BUILTIN(mma_pmxvf64gernp, "vW512*W256Vi15i3", true) +CUSTOM_BUILTIN(mma_pmxvf64gernn, "vW512*W256Vi15i3", true) +CUSTOM_BUILTIN(mma_xvbf16ger2, "vW512*VV", false) +CUSTOM_BUILTIN(mma_pmxvbf16ger2, "vW512*VVi15i15i3", false) +CUSTOM_BUILTIN(mma_xvbf16ger2pp, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvbf16ger2pn, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvbf16ger2np, "vW512*VV", true) +CUSTOM_BUILTIN(mma_xvbf16ger2nn, "vW512*VV", true) +CUSTOM_BUILTIN(mma_pmxvbf16ger2pp, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_pmxvbf16ger2pn, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_pmxvbf16ger2np, "vW512*VVi15i15i3", true) +CUSTOM_BUILTIN(mma_pmxvbf16ger2nn, "vW512*VVi15i15i3", true) // FIXME: Obviously incomplete. 
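The renamed built-ins keep their existing signatures; only the prefix changes (mma_ for accumulator operations, vsx_ for the paired-vector load/store and assemble/disassemble operations). A small usage sketch, again assuming a Power10 target and modeled on test3 and test66 of the renamed CodeGen test later in this patch (the function name is made up):

  // Sketch only: the paired-vector built-ins now use the vsx_ prefix.
  void sketch_pairs(const __vector_pair *src, const __vector_pair *dst,
                    vector unsigned char vc) {
    __vector_pair vp = __builtin_vsx_lxvp(0LL, src);  // load a 32-byte pair from src
    __builtin_vsx_stxvp(vp, 32LL, dst);               // store it 32 bytes past dst
    __vector_pair built;
    __builtin_vsx_assemble_pair(&built, vc, vc);      // build a pair from two vectors
  }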
#undef BUILTIN -#undef MMA_BUILTIN +#undef CUSTOM_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14861,8 +14861,8 @@ // use custom code generation to expand a builtin call with a pointer to a // load (if the corresponding instruction accumulates its result) followed by // the call to the intrinsic and a store of the result. -#define MMA_BUILTIN(Name, Types, Accumulate) \ - case PPC::BI__builtin_mma_##Name: +#define CUSTOM_BUILTIN(Name, Types, Accumulate) \ + case PPC::BI__builtin_##Name: #include "clang/Basic/BuiltinsPPC.def" { // The first argument of these two builtins is a pointer used to store their @@ -14870,9 +14870,9 @@ // return values. So, here we emit code extracting these values from the // intrinsic results and storing them using that pointer. if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc || - BuiltinID == PPC::BI__builtin_mma_disassemble_pair) { + BuiltinID == PPC::BI__builtin_vsx_disassemble_pair) { unsigned NumVecs = 2; - auto Intrinsic = Intrinsic::ppc_mma_disassemble_pair; + auto Intrinsic = Intrinsic::ppc_vsx_disassemble_pair; if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) { NumVecs = 4; Intrinsic = Intrinsic::ppc_mma_disassemble_acc; @@ -14893,16 +14893,16 @@ } bool Accumulate; switch (BuiltinID) { - #define MMA_BUILTIN(Name, Types, Acc) \ - case PPC::BI__builtin_mma_##Name: \ - ID = Intrinsic::ppc_mma_##Name; \ + #define CUSTOM_BUILTIN(Name, Types, Acc) \ + case PPC::BI__builtin_##Name: \ + ID = Intrinsic::ppc_##Name; \ Accumulate = Acc; \ break; #include "clang/Basic/BuiltinsPPC.def" } - if (BuiltinID == PPC::BI__builtin_mma_lxvp || - BuiltinID == PPC::BI__builtin_mma_stxvp) { - if (BuiltinID == PPC::BI__builtin_mma_lxvp) { + if (BuiltinID == PPC::BI__builtin_vsx_lxvp || + BuiltinID == PPC::BI__builtin_vsx_stxvp) { + if (BuiltinID == PPC::BI__builtin_vsx_lxvp) { Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy); Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]); } else { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3301,8 +3301,8 @@ return SemaBuiltinConstantArgRange(TheCall, 2, 0, 7); case PPC::BI__builtin_vsx_xxpermx: return SemaBuiltinConstantArgRange(TheCall, 3, 0, 7); -#define MMA_BUILTIN(Name, Types, Acc) \ - case PPC::BI__builtin_mma_##Name: \ +#define CUSTOM_BUILTIN(Name, Types, Acc) \ + case PPC::BI__builtin_##Name: \ return SemaBuiltinPPCMMACall(TheCall, Types); #include "clang/Basic/BuiltinsPPC.def" } diff --git a/clang/test/CodeGen/builtins-ppc-mma.c b/clang/test/CodeGen/builtins-ppc-pair-mma.c rename from clang/test/CodeGen/builtins-ppc-mma.c rename to clang/test/CodeGen/builtins-ppc-pair-mma.c --- a/clang/test/CodeGen/builtins-ppc-mma.c +++ b/clang/test/CodeGen/builtins-ppc-pair-mma.c @@ -44,7 +44,7 @@ // CHECK-LABEL: @test3( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) // CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <256 x i1>* // CHECK-NEXT: store <256 x i1> [[TMP0]], <256 x i1>* [[TMP1]], align 32, !tbaa !6 // CHECK-NEXT: ret void @@ -53,7 +53,7 @@ __vector_quad vq = *((__vector_quad *)vqp); __vector_pair vp = *((__vector_pair *)vpp); __vector_pair res; - __builtin_mma_assemble_pair(&res, 
vc, vc); + __builtin_vsx_assemble_pair(&res, vc, vc); *((__vector_pair *)resp) = res; } @@ -61,7 +61,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* // CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32 -// CHECK-NEXT: [[TMP2:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>* // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP2]], 0 // CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP3]], align 16 @@ -72,7 +72,7 @@ // CHECK-NEXT: ret void // void test4(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { - __builtin_mma_disassemble_pair(resp, (__vector_pair*)vpp); + __builtin_vsx_disassemble_pair(resp, (__vector_pair*)vpp); } // CHECK-LABEL: @test5( @@ -1040,104 +1040,104 @@ // CHECK-LABEL: @test66( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* -// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* -// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP1]], i8* [[TMP2]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], i8* [[TMP2]]) // CHECK-NEXT: ret void // void test66(const __vector_pair *vpp, const __vector_pair *vp2) { - __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); - __builtin_mma_stxvp(vp, 0LL, vp2); + __vector_pair vp = __builtin_vsx_lxvp(0LL, vpp); + __builtin_vsx_stxvp(vp, 0LL, vp2); } // CHECK-LABEL: @test67( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[OFFSET:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* // CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[OFFSET]] -// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) // CHECK-NEXT: ret void // void test67(const __vector_pair *vpp, signed long long offset, const __vector_pair *vp2) { - __vector_pair vp = __builtin_mma_lxvp(offset, vpp); - __builtin_mma_stxvp(vp, offset, vp2); + __vector_pair vp = __builtin_vsx_lxvp(offset, vpp); + __builtin_vsx_stxvp(vp, offset, vp2); } // CHECK-LABEL: @test68( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 18 -// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* // CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 18 -// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) // CHECK-NEXT: ret void // void test68(const 
__vector_pair *vpp, const __vector_pair *vp2) { - __vector_pair vp = __builtin_mma_lxvp(18LL, vpp); - __builtin_mma_stxvp(vp, 18LL, vp2); + __vector_pair vp = __builtin_vsx_lxvp(18LL, vpp); + __builtin_vsx_stxvp(vp, 18LL, vp2); } // CHECK-LABEL: @test69( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 -// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* // CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 1 -// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) // CHECK-NEXT: ret void // void test69(const __vector_pair *vpp, const __vector_pair *vp2) { - __vector_pair vp = __builtin_mma_lxvp(1LL, vpp); - __builtin_mma_stxvp(vp, 1LL, vp2); + __vector_pair vp = __builtin_vsx_lxvp(1LL, vpp); + __builtin_vsx_stxvp(vp, 1LL, vp2); } // CHECK-LABEL: @test70( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 42 -// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* // CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 42 -// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) // CHECK-NEXT: ret void // void test70(const __vector_pair *vpp, const __vector_pair *vp2) { - __vector_pair vp = __builtin_mma_lxvp(42LL, vpp); - __builtin_mma_stxvp(vp, 42LL, vp2); + __vector_pair vp = __builtin_vsx_lxvp(42LL, vpp); + __builtin_vsx_stxvp(vp, 42LL, vp2); } // CHECK-LABEL: @test71( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VPP:%.*]], i64 128 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i1>* [[TMP0]] to i8* -// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VP2:%.*]], i64 128 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i1>* [[TMP3]] to i8* -// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) // CHECK-NEXT: ret void // void test71(const __vector_pair *vpp, const __vector_pair *vp2) { - __vector_pair vp = __builtin_mma_lxvp(32768LL, vpp); - __builtin_mma_stxvp(vp, 32768LL, vp2); + __vector_pair vp = __builtin_vsx_lxvp(32768LL, vpp); + __builtin_vsx_stxvp(vp, 32768LL, vp2); } // CHECK-LABEL: @test72( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 32799 -// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* // CHECK-NEXT: [[TMP4:%.*]] = 
getelementptr i8, i8* [[TMP3]], i64 32799 -// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) // CHECK-NEXT: ret void // void test72(const __vector_pair *vpp, const __vector_pair *vp2) { - __vector_pair vp = __builtin_mma_lxvp(32799LL, vpp); - __builtin_mma_stxvp(vp, 32799LL, vp2); + __vector_pair vp = __builtin_vsx_lxvp(32799LL, vpp); + __builtin_vsx_stxvp(vp, 32799LL, vp2); } // CHECK-LABEL: @test73( @@ -1146,7 +1146,7 @@ // CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 8 -// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]], i32 0, i32 0) // CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* // CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]] @@ -1154,7 +1154,7 @@ // void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { __vector_quad vq = *((__vector_quad *)vqp); - __vector_pair vp = __builtin_mma_lxvp(8LL, vpp); + __vector_pair vp = __builtin_vsx_lxvp(8LL, vpp); __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0); *((__vector_quad *)resp) = vq; } @@ -1164,7 +1164,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* // CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* -// CHECK-NEXT: [[TMP3:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* // CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, [[TBAA2]] @@ -1172,7 +1172,7 @@ // void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { __vector_quad vq = *((__vector_quad *)vqp); - __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); + __vector_pair vp = __builtin_vsx_lxvp(0LL, vpp); __builtin_mma_xvf64gernp(&vq, vp, vc); *((__vector_quad *)resp) = vq; } @@ -1183,7 +1183,7 @@ // CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[OFFS:%.*]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]]) // CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* // CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]] @@ -1191,7 +1191,7 @@ // void test75(unsigned char *vqp, signed long long offs, const __vector_pair *vpp, vector unsigned char vc, 
unsigned char *resp) { __vector_quad vq = *((__vector_quad *)vqp); - __vector_pair vp = __builtin_mma_lxvp(offs, vpp); + __vector_pair vp = __builtin_vsx_lxvp(offs, vpp); __builtin_mma_xvf64gernp(&vq, vp, vc); *((__vector_quad *)resp) = vq; } diff --git a/clang/test/Sema/ppc-mma-types.c b/clang/test/Sema/ppc-pair-mma-types.c rename from clang/test/Sema/ppc-mma-types.c rename to clang/test/Sema/ppc-pair-mma-types.c --- a/clang/test/Sema/ppc-mma-types.c +++ b/clang/test/Sema/ppc-pair-mma-types.c @@ -246,7 +246,7 @@ __vector_pair *vpp = (__vector_pair *)ptr; __vector_pair vp1 = *vpp; __vector_pair vp2; - __builtin_mma_assemble_pair(&vp2, vc, vc); + __builtin_vsx_assemble_pair(&vp2, vc, vc); __vector_pair vp3; __vector_quad vq; __builtin_mma_xvf64ger(&vq, vp3, vc); @@ -320,16 +320,16 @@ } void testBuiltinTypes1(const __vector_pair *vpp, const __vector_pair *vp2, float f) { - __vector_pair vp = __builtin_mma_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}} - __builtin_mma_stxvp(vp, 32799, vp2); // expected-error {{passing 'int' to parameter of incompatible type 'long long'}} + __vector_pair vp = __builtin_vsx_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}} + __builtin_vsx_stxvp(vp, 32799, vp2); // expected-error {{passing 'int' to parameter of incompatible type 'long long'}} } void testBuiltinTypes2(__vector_pair *vpp, const __vector_pair *vp2, unsigned char c) { - __vector_pair vp = __builtin_mma_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}} - __builtin_mma_stxvp(vp, c, vp2); // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}} + __vector_pair vp = __builtin_vsx_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}} + __builtin_vsx_stxvp(vp, c, vp2); // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}} } void testBuiltinTypes3(vector int v, __vector_pair *vp2, signed long long ll, unsigned short s) { - __vector_pair vp = __builtin_mma_lxvp(ll, v); // expected-error {{passing '__vector int' (vector of 4 'int' values) to parameter of incompatible type 'const __vector_pair *'}} - __builtin_mma_stxvp(vp, ll, s); // expected-error {{passing 'unsigned short' to parameter of incompatible type 'const __vector_pair *'}} + __vector_pair vp = __builtin_vsx_lxvp(ll, v); // expected-error {{passing '__vector int' (vector of 4 'int' values) to parameter of incompatible type 'const __vector_pair *'}} + __builtin_vsx_stxvp(vp, ll, s); // expected-error {{passing 'unsigned short' to parameter of incompatible type 'const __vector_pair *'}} } diff --git a/clang/test/SemaCXX/ppc-mma-types.cpp b/clang/test/SemaCXX/ppc-pair-mma-types.cpp rename from clang/test/SemaCXX/ppc-mma-types.cpp rename to clang/test/SemaCXX/ppc-pair-mma-types.cpp --- a/clang/test/SemaCXX/ppc-mma-types.cpp +++ b/clang/test/SemaCXX/ppc-pair-mma-types.cpp @@ -367,7 +367,7 @@ __vector_pair *vpp = (__vector_pair *)ptr; return *vpp; // expected-error {{invalid use of PPC MMA type}} }; - auto f3 = [](vector unsigned char vc) { __vector_pair vp; __builtin_mma_assemble_pair(&vp, vc, vc); return vp; }; // expected-error {{invalid use of PPC MMA type}} + auto f3 = [](vector unsigned char vc) { __vector_pair vp; __builtin_vsx_assemble_pair(&vp, vc, vc); return vp; }; // expected-error {{invalid use of PPC MMA type}} } // cast diff --git 
a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1132,12 +1132,8 @@ def int_ppc_vsx_lxvll : Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem, IntrArgMemOnly]>; -def int_ppc_vsx_stxvl : - Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty], - [IntrWriteMem, IntrArgMemOnly]>; -def int_ppc_vsx_stxvll : - Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty], - [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_vsx_lxvp : + Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; // Vector store. def int_ppc_vsx_stxvw4x : Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], @@ -1148,6 +1144,15 @@ [IntrWriteMem, IntrArgMemOnly]>; def int_ppc_vsx_stxvd2x_be : Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_vsx_stxvl : + Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_vsx_stxvll : + Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_vsx_stxvp : + Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty], [IntrWriteMem, + IntrArgMemOnly]>; // Vector and scalar maximum. def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">; def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">; @@ -1406,6 +1411,14 @@ } let TargetPrefix = "ppc" in { + def int_ppc_vsx_assemble_pair : + Intrinsic<[llvm_v256i1_ty], + [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + + def int_ppc_vsx_disassemble_pair : + Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty], + [llvm_v256i1_ty], [IntrNoMem]>; + def int_ppc_mma_assemble_acc : Intrinsic<[llvm_v512i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], @@ -1415,14 +1428,6 @@ Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [llvm_v512i1_ty], [IntrNoMem]>; - def int_ppc_mma_assemble_pair : - Intrinsic<[llvm_v256i1_ty], - [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - - def int_ppc_mma_disassemble_pair : - Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty], - [llvm_v256i1_ty], [IntrNoMem]>; - def int_ppc_mma_xxmtacc : Intrinsic<[llvm_v512i1_ty], [llvm_v512i1_ty], [IntrNoMem]>; @@ -1432,14 +1437,6 @@ def int_ppc_mma_xxsetaccz : Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>; - def int_ppc_mma_lxvp : - Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty], - [IntrReadMem, IntrArgMemOnly]>; - - def int_ppc_mma_stxvp : - Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty], - [IntrWriteMem, IntrArgMemOnly]>; - // MMA Reduced-Precision: Outer Product Intrinsic Definitions. 
defm int_ppc_mma_xvi4ger8 : PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10614,7 +10614,7 @@ return DAG.getRegister(PPC::R2, MVT::i32); case Intrinsic::ppc_mma_disassemble_acc: - case Intrinsic::ppc_mma_disassemble_pair: { + case Intrinsic::ppc_vsx_disassemble_pair: { int NumVecs = 2; SDValue WideVec = Op.getOperand(1); if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) { diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -1617,7 +1617,7 @@ let Predicates = [PairedVectorMemops] in { def : Pat<(v256i1 (PPCPairBuild v4i32:$vs1, v4i32:$vs0)), Concats.VecsToVecPair0>; - def : Pat<(v256i1 (int_ppc_mma_assemble_pair v16i8:$vs1, v16i8:$vs0)), + def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)), Concats.VecsToVecPair0>; def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))), (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>; @@ -1659,18 +1659,18 @@ let Predicates = [PairedVectorMemops] in { // Intrinsics for Paired Vector Loads. - def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>; - def : Pat<(v256i1 (int_ppc_mma_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>; + def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>; + def : Pat<(v256i1 (int_ppc_vsx_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>; let Predicates = [PairedVectorMemops, PrefixInstrs] in { - def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>; + def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>; } // Intrinsics for Paired Vector Stores. 
- def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst), + def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX16:$dst), (STXVP $XSp, memrix16:$dst)>; - def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddrX16:$dst), + def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, xaddrX16:$dst), (STXVPX $XSp, xaddrX16:$dst)>; let Predicates = [PairedVectorMemops, PrefixInstrs] in { - def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst), + def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX34:$dst), (PSTXVP $XSp, memri34:$dst)>; } } diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -276,9 +276,9 @@ return SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast(MemI)) { if (IMemI->getIntrinsicID() == Intrinsic::prefetch || - IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) + IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) return IMemI->getArgOperand(0); - if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) + if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) return IMemI->getArgOperand(1); } @@ -347,10 +347,10 @@ PtrValue = SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast(&J)) { if (IMemI->getIntrinsicID() == Intrinsic::prefetch || - IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) { + IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) { MemI = IMemI; PtrValue = IMemI->getArgOperand(0); - } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) { + } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) { MemI = IMemI; PtrValue = IMemI->getArgOperand(1); } else continue; @@ -834,8 +834,8 @@ return false; // There are no update forms for P10 lxvp/stxvp intrinsic. auto *II = dyn_cast(I); - if (II && ((II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) || - II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp)) + if (II && ((II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) || + II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp)) return false; // See getPreIndexedAddressParts, the displacement for LDU/STDU has to // be 4's multiple (DS-form). For i64 loads/stores when the displacement @@ -877,8 +877,8 @@ // Check if it is a P10 lxvp/stxvp intrinsic. auto *II = dyn_cast(I); if (II) - return II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp || - II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp; + return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp || + II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp; // Check if it is a P9 vector load/store. 
return ST && ST->hasP9Vector() && (PtrValue->getType()->getPointerElementType()->isVectorTy()); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1224,7 +1224,7 @@ case Intrinsic::ppc_vsx_lxvw4x_be: case Intrinsic::ppc_vsx_lxvl: case Intrinsic::ppc_vsx_lxvll: - case Intrinsic::ppc_mma_lxvp: { + case Intrinsic::ppc_vsx_lxvp: { Info.PtrVal = Inst->getArgOperand(0); Info.ReadMem = true; Info.WriteMem = false; @@ -1241,7 +1241,7 @@ case Intrinsic::ppc_vsx_stxvw4x_be: case Intrinsic::ppc_vsx_stxvl: case Intrinsic::ppc_vsx_stxvll: - case Intrinsic::ppc_mma_stxvp: { + case Intrinsic::ppc_vsx_stxvp: { Info.PtrVal = Inst->getArgOperand(1); Info.ReadMem = false; Info.WriteMem = true; diff --git a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll --- a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll +++ b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll @@ -8,8 +8,8 @@ ; This test checks that LSR properly recognizes lxvp/stxvp as load/store ; intrinsics to avoid generating x-form instructions instead of d-forms. -declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) -declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) +declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) +declare void @llvm.ppc.vsx.stxvp(<256 x i1>, i8*) define void @foo(i32 zeroext %n, <256 x i1>* %ptr, <256 x i1>* %ptr2) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry @@ -78,24 +78,24 @@ for.body: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] %2 = getelementptr i8, i8* %0, i64 %indvars.iv - %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %3 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %2) %add2 = add nuw nsw i64 %indvars.iv, 32 %4 = getelementptr i8, i8* %0, i64 %add2 - %5 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %4) + %5 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %4) %add4 = add nuw nsw i64 %indvars.iv, 64 %6 = getelementptr i8, i8* %0, i64 %add4 - %7 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %6) + %7 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %6) %add6 = add nuw nsw i64 %indvars.iv, 96 %8 = getelementptr i8, i8* %0, i64 %add6 - %9 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %8) + %9 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %8) %10 = getelementptr i8, i8* %1, i64 %indvars.iv - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %3, i8* %10) + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %3, i8* %10) %11 = getelementptr i8, i8* %1, i64 %add2 - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %5, i8* %11) + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %5, i8* %11) %12 = getelementptr i8, i8* %1, i64 %add4 - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %7, i8* %12) + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %7, i8* %12) %13 = getelementptr i8, i8* %1, i64 %add6 - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %9, i8* %13) + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %9, i8* %13) %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond.not, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll @@ -81,13 +81,13 @@ %x_ix_dim_0_6 = getelementptr %_elem_type_of_x, 
%_elem_type_of_x* %x_rvo_based_addr_5, i64 %i.08 %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_6 to i8* %0 = getelementptr i8, i8* %x_ix_dim_0_, i64 1 - %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) - %2 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %1) + %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %0) + %2 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %1) %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %2, 0 %.fca.1.extract2 = extractvalue { <16 x i8>, <16 x i8> } %2, 1 %3 = getelementptr i8, i8* %x_ix_dim_0_, i64 33 - %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) - %5 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %4) + %4 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %3) + %5 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %4) %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 0 %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 1 %6 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double> @@ -110,5 +110,5 @@ ret void } -declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) -declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) +declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -40,28 +40,6 @@ ret void } -; assemble_pair -declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) -define void @ass_pair(<256 x i1>* %ptr, <16 x i8> %vc) { -; CHECK-LABEL: ass_pair: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: stxv v2, 16(r3) -; CHECK-NEXT: stxv v3, 0(r3) -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: ass_pair: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: vmr v3, v2 -; CHECK-BE-NEXT: stxv v2, 16(r3) -; CHECK-BE-NEXT: stxv v2, 0(r3) -; CHECK-BE-NEXT: blr -entry: - %0 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %vc, <16 x i8> %vc) - store <256 x i1> %0, <256 x i1>* %ptr, align 32 - ret void -} - ; xxmtacc declare <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1>) define void @int_xxmtacc(<512 x i1>* %ptr, <16 x i8> %vc) { @@ -202,51 +180,23 @@ ret void } -; disassemble_pair -declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) -define void @disass_pair(<256 x i1>* %ptr1, <16 x i8>* %ptr2, <16 x i8>* %ptr3) { -; CHECK-LABEL: disass_pair: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxv vs1, 0(r3) -; CHECK-NEXT: lxv vs0, 16(r3) -; CHECK-NEXT: stxv vs1, 0(r4) -; CHECK-NEXT: stxv vs0, 0(r5) -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: disass_pair: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs1, 16(r3) -; CHECK-BE-NEXT: lxv vs0, 0(r3) -; CHECK-BE-NEXT: stxv vs0, 0(r4) -; CHECK-BE-NEXT: stxv vs1, 0(r5) -; CHECK-BE-NEXT: blr -entry: - %0 = load <256 x i1>, <256 x i1>* %ptr1, align 32 - %1 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %0) - %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 - %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 - store <16 x i8> %2, <16 x i8>* %ptr2, align 16 - store <16 x i8> %3, <16 x i8>* %ptr3, align 16 - ret void -} - declare <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1>, <16 x i8>, <16 x i8>) define void @testBranch(<512 x i1>* %ptr, <16 x i8> %vc, i32 %val) { ; CHECK-LABEL: testBranch: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: cmplwi r7, 0 -; CHECK-NEXT: beq cr0, .LBB7_2 +; CHECK-NEXT: beq cr0, .LBB5_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: xxsetaccz acc0 -; CHECK-NEXT: b .LBB7_3 -; CHECK-NEXT: .LBB7_2: # %if.else +; CHECK-NEXT: b .LBB5_3 +; CHECK-NEXT: .LBB5_2: # %if.else ; CHECK-NEXT: lxv vs1, 32(r3) ; CHECK-NEXT: lxv vs0, 48(r3) ; CHECK-NEXT: lxv vs3, 0(r3) ; CHECK-NEXT: lxv vs2, 16(r3) ; CHECK-NEXT: xxmtacc acc0 ; CHECK-NEXT: xvi4ger8pp acc0, v2, v2 -; CHECK-NEXT: .LBB7_3: # %if.end +; CHECK-NEXT: .LBB5_3: # %if.end ; CHECK-NEXT: xxmfacc acc0 ; CHECK-NEXT: stxv vs0, 48(r3) ; CHECK-NEXT: stxv vs1, 32(r3) @@ -257,18 +207,18 @@ ; CHECK-BE-LABEL: testBranch: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: cmplwi r7, 0 -; CHECK-BE-NEXT: beq cr0, .LBB7_2 +; CHECK-BE-NEXT: beq cr0, .LBB5_2 ; CHECK-BE-NEXT: # %bb.1: # %if.then ; CHECK-BE-NEXT: xxsetaccz acc0 -; CHECK-BE-NEXT: b .LBB7_3 -; CHECK-BE-NEXT: .LBB7_2: # %if.else +; CHECK-BE-NEXT: b .LBB5_3 +; CHECK-BE-NEXT: .LBB5_2: # %if.else ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs3, 48(r3) ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: xxmtacc acc0 ; CHECK-BE-NEXT: xvi4ger8pp acc0, v2, v2 -; CHECK-BE-NEXT: .LBB7_3: # %if.end +; CHECK-BE-NEXT: .LBB5_3: # %if.end ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: stxv vs0, 0(r3) @@ -447,7 +397,7 @@ ; CHECK-NEXT: mtctr r4 ; CHECK-NEXT: li r4, 0 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB11_2: # %for.body +; CHECK-NEXT: .LBB9_2: # %for.body ; CHECK-NEXT: # ; CHECK-NEXT: rldic r7, r6, 4, 28 ; CHECK-NEXT: addi r6, r6, 6 @@ -482,7 +432,7 @@ ; CHECK-NEXT: stxv vs1, 160(r7) ; CHECK-NEXT: stxv vs2, 144(r7) ; CHECK-NEXT: stxv vs3, 128(r7) -; CHECK-NEXT: bdnz .LBB11_2 +; CHECK-NEXT: bdnz .LBB9_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-NEXT: blr ; @@ -496,7 +446,7 @@ ; CHECK-BE-NEXT: mtctr r4 ; CHECK-BE-NEXT: li r4, 0 ; CHECK-BE-NEXT: .p2align 4 -; CHECK-BE-NEXT: .LBB11_2: # %for.body +; CHECK-BE-NEXT: .LBB9_2: # %for.body ; CHECK-BE-NEXT: # ; CHECK-BE-NEXT: rldic r7, r6, 4, 28 ; CHECK-BE-NEXT: addi r6, r6, 6 @@ -531,7 +481,7 @@ ; CHECK-BE-NEXT: stxv vs0, 128(r7) ; CHECK-BE-NEXT: stxv vs3, 176(r7) ; CHECK-BE-NEXT: stxv vs2, 160(r7) -; CHECK-BE-NEXT: bdnz .LBB11_2 +; CHECK-BE-NEXT: bdnz .LBB9_2 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-BE-NEXT: blr entry: @@ -674,189 +624,12 @@ ret void } -declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32) -declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>) - -; Function Attrs: nounwind -define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) { -; CHECK-LABEL: test_ldst_1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvp vsp0, 0(r3) -; CHECK-NEXT: stxvp vsp0, 0(r4) -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: test_ldst_1: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvp vsp0, 0(r3) -; CHECK-BE-NEXT: stxvp vsp0, 0(r4) -; CHECK-BE-NEXT: blr -entry: - %0 = bitcast <256 x i1>* %vpp to i8* - %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) - %2 = bitcast <256 x i1>* %vp2 to i8* - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2) - ret void -} - -; Function Attrs: argmemonly nounwind readonly -declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) - -; Function Attrs: argmemonly nounwind writeonly -declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) - -; Function Attrs: nounwind -define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2) { -; CHECK-LABEL: test_ldst_2: -; CHECK: # %bb.0: # 
%entry -; CHECK-NEXT: lxvpx vsp0, r3, r4 -; CHECK-NEXT: stxvpx vsp0, r5, r4 -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: test_ldst_2: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvpx vsp0, r3, r4 -; CHECK-BE-NEXT: stxvpx vsp0, r5, r4 -; CHECK-BE-NEXT: blr -entry: - %0 = bitcast <256 x i1>* %vpp to i8* - %1 = getelementptr i8, i8* %0, i64 %offset - %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) - %3 = bitcast <256 x i1>* %vp2 to i8* - %4 = getelementptr i8, i8* %3, i64 %offset - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) - ret void -} - -; Function Attrs: nounwind -define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2) { -; CHECK-LABEL: test_ldst_3: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li r5, 18 -; CHECK-NEXT: lxvpx vsp0, r3, r5 -; CHECK-NEXT: stxvpx vsp0, r4, r5 -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: test_ldst_3: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: li r5, 18 -; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 -; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 -; CHECK-BE-NEXT: blr -entry: - %0 = bitcast <256 x i1>* %vpp to i8* - %1 = getelementptr i8, i8* %0, i64 18 - %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) - %3 = bitcast <256 x i1>* %vp2 to i8* - %4 = getelementptr i8, i8* %3, i64 18 - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) - ret void -} - -; Function Attrs: nounwind -define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2) { -; CHECK-LABEL: test_ldst_4: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: lxvpx vsp0, r3, r5 -; CHECK-NEXT: stxvpx vsp0, r4, r5 -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: test_ldst_4: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: li r5, 1 -; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 -; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 -; CHECK-BE-NEXT: blr -entry: - %0 = bitcast <256 x i1>* %vpp to i8* - %1 = getelementptr i8, i8* %0, i64 1 - %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) - %3 = bitcast <256 x i1>* %vp2 to i8* - %4 = getelementptr i8, i8* %3, i64 1 - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) - ret void -} - -; Function Attrs: nounwind -define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2) { -; CHECK-LABEL: test_ldst_5: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li r5, 42 -; CHECK-NEXT: lxvpx vsp0, r3, r5 -; CHECK-NEXT: stxvpx vsp0, r4, r5 -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: test_ldst_5: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: li r5, 42 -; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 -; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 -; CHECK-BE-NEXT: blr -entry: - %0 = bitcast <256 x i1>* %vpp to i8* - %1 = getelementptr i8, i8* %0, i64 42 - %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) - %3 = bitcast <256 x i1>* %vp2 to i8* - %4 = getelementptr i8, i8* %3, i64 42 - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) - ret void -} - -; Function Attrs: nounwind -define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2) { -; CHECK-LABEL: test_ldst_6: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvp vsp0, 4096(r3) -; CHECK-NEXT: stxvp vsp0, 4096(r4) -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: test_ldst_6: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvp vsp0, 4096(r3) -; CHECK-BE-NEXT: stxvp vsp0, 4096(r4) -; CHECK-BE-NEXT: blr -entry: - %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128 - %1 = bitcast <256 x i1>* %0 to i8* - %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) - %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128 - %4 = bitcast <256 x i1>* %3 to i8* - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) - ret void -} - -; 
Function Attrs: nounwind -define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2) { -; FIXME: A prefixed load (plxvp) is expected here as the offset in this -; test case is a constant that fits within 34-bits. -; CHECK-LABEL: test_ldst_7: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li r5, 0 -; CHECK-NEXT: ori r5, r5, 32799 -; CHECK-NEXT: lxvpx vsp0, r3, r5 -; CHECK-NEXT: stxvpx vsp0, r4, r5 -; CHECK-NEXT: blr -; -; CHECK-BE-LABEL: test_ldst_7: -; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: li r5, 0 -; CHECK-BE-NEXT: ori r5, r5, 32799 -; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 -; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 -; CHECK-BE-NEXT: blr -entry: - %0 = bitcast <256 x i1>* %vpp to i8* - %1 = getelementptr i8, i8* %0, i64 32799 - %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) - %3 = bitcast <256 x i1>* %vp2 to i8* - %4 = getelementptr i8, i8* %3, i64 32799 - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) - ret void -} +declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) +declare void @llvm.ppc.vsx.stxvp(<256 x i1>, i8*) ; Function Attrs: nofree nounwind -define void @test_ldst_8(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { -; CHECK-LABEL: test_ldst_8: +define void @test_ldst_1(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxv vs1, 32(r3) ; CHECK-NEXT: lxv vs0, 48(r3) @@ -873,7 +646,7 @@ ; CHECK-NEXT: stxv vs3, 0(r7) ; CHECK-NEXT: blr ; -; CHECK-BE-LABEL: test_ldst_8: +; CHECK-BE-LABEL: test_ldst_1: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) @@ -894,7 +667,7 @@ %1 = load <512 x i1>, <512 x i1>* %0, align 64 %2 = bitcast <256 x i1>* %vpp to i8* %3 = getelementptr i8, i8* %2, i64 8 - %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) + %4 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %3) %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0) %6 = bitcast i8* %resp to <512 x i1>* store <512 x i1> %5, <512 x i1>* %6, align 64 @@ -902,8 +675,8 @@ } ; Function Attrs: nofree nounwind -define void @test_ldst_9(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { -; CHECK-LABEL: test_ldst_9: +define void @test_ldst_2(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxv vs1, 32(r3) ; CHECK-NEXT: lxv vs0, 48(r3) @@ -919,7 +692,7 @@ ; CHECK-NEXT: stxv vs3, 0(r7) ; CHECK-NEXT: blr ; -; CHECK-BE-LABEL: test_ldst_9: +; CHECK-BE-LABEL: test_ldst_2: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) @@ -938,7 +711,7 @@ %0 = bitcast i8* %vqp to <512 x i1>* %1 = load <512 x i1>, <512 x i1>* %0, align 64 %2 = bitcast <256 x i1>* %vpp to i8* - %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %3 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %2) %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) %5 = bitcast i8* %resp to <512 x i1>* store <512 x i1> %4, <512 x i1>* %5, align 64 @@ -946,8 +719,8 @@ } ; Function Attrs: nofree nounwind -define void @test_ldst_10(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { -; CHECK-LABEL: test_ldst_10: +define void @test_ldst_3(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: 
test_ldst_3: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxv vs1, 32(r3) ; CHECK-NEXT: lxv vs0, 48(r3) @@ -963,7 +736,7 @@ ; CHECK-NEXT: stxv vs3, 0(r9) ; CHECK-NEXT: blr ; -; CHECK-BE-LABEL: test_ldst_10: +; CHECK-BE-LABEL: test_ldst_3: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) @@ -982,9 +755,12 @@ %0 = bitcast i8* %vqp to <512 x i1>* %1 = load <512 x i1>, <512 x i1>* %0, align 64 %2 = bitcast <256 x i1>* %vpp to i8* - %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %3 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %2) %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) %5 = bitcast i8* %resp to <512 x i1>* store <512 x i1> %4, <512 x i1>* %5, align 64 ret void } + +declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32) +declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -7,7 +7,7 @@ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) -declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) +declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, i8* %ptr) { ; CHECK-LABEL: intrinsics1: ; CHECK: # %bb.0: @@ -62,7 +62,7 @@ %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) %4 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %3, <16 x i8> %vc2, <16 x i8> %vc4, i32 0, i32 0) - %5 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %vc4, <16 x i8> %vc1) + %5 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %vc4, <16 x i8> %vc1) %6 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %4, <256 x i1> %5, <16 x i8> %vc1, i32 0, i32 0) %7 = bitcast i8* %ptr to <512 x i1>* store <512 x i1> %6, <512 x i1>* %7, align 64 @@ -126,7 +126,7 @@ %2 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) %4 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %3, <16 x i8> %vc2, <16 x i8> %vc4, i32 0, i32 0) - %5 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %vc4, <16 x i8> %vc1) + %5 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %vc4, <16 x i8> %vc1) %6 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %4, <256 x i1> %5, <16 x i8> %vc1, i32 0, i32 0) %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %6) %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %7, 0 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -6,7 +6,7 @@ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE -declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) +declare <256 x i1> 
@llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() declare <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>) declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) @@ -68,7 +68,7 @@ %0 = load <16 x i8>, <16 x i8>* %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 1 %1 = load <16 x i8>, <16 x i8>* %arrayidx1, align 16 - %2 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %0, <16 x i8> %1) + %2 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %0, <16 x i8> %1) %3 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() %cmp11 = icmp sgt i32 %Len, 2 br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup @@ -165,7 +165,7 @@ %0 = load <16 x i8>, <16 x i8>* %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 1 %1 = load <16 x i8>, <16 x i8>* %arrayidx1, align 16 - %2 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %0, <16 x i8> %1) + %2 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %0, <16 x i8> %1) %arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 2 %3 = load <16 x i8>, <16 x i8>* %arrayidx2, align 16 %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %2, <16 x i8> %3) diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -469,38 +469,38 @@ %_ix_x_len = shl nuw nsw i64 %indvars.iv, 3 %x_ix_dim_0_113 = getelementptr inbounds %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_112, i64 %indvars.iv %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_113 to i8* - %55 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %x_ix_dim_0_) + %55 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull %x_ix_dim_0_) %a_ix_dim_1_ = getelementptr inbounds i8, i8* %a_ix_dim_0_, i64 %_ix_x_len - %56 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_) + %56 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull %a_ix_dim_1_) %a_ix_dim_1_29 = getelementptr inbounds i8, i8* %a_ix_dim_0_25, i64 %_ix_x_len - %57 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_29) + %57 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull %a_ix_dim_1_29) %a_ix_dim_1_45 = getelementptr inbounds i8, i8* %a_ix_dim_0_41, i64 %_ix_x_len - %58 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_45) + %58 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull %a_ix_dim_1_45) %a_ix_dim_1_61 = getelementptr inbounds i8, i8* %a_ix_dim_0_57, i64 %_ix_x_len - %59 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_61) + %59 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull %a_ix_dim_1_61) %a_ix_dim_1_77 = getelementptr inbounds i8, i8* %a_ix_dim_0_73, i64 %_ix_x_len - %60 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_77) + %60 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull %a_ix_dim_1_77) %a_ix_dim_1_93 = getelementptr inbounds i8, i8* %a_ix_dim_0_89, i64 %_ix_x_len - %61 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_93) - %62 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %55) + %61 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull %a_ix_dim_1_93) + %62 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %55) %.fca.0.extract35 = extractvalue { <16 x 
i8>, <16 x i8> } %62, 0 %.fca.1.extract36 = extractvalue { <16 x i8>, <16 x i8> } %62, 1 - %63 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %56) + %63 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %56) %.fca.0.extract29 = extractvalue { <16 x i8>, <16 x i8> } %63, 0 %.fca.1.extract30 = extractvalue { <16 x i8>, <16 x i8> } %63, 1 - %64 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %57) + %64 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %57) %.fca.0.extract23 = extractvalue { <16 x i8>, <16 x i8> } %64, 0 %.fca.1.extract24 = extractvalue { <16 x i8>, <16 x i8> } %64, 1 - %65 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %58) + %65 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %58) %.fca.0.extract17 = extractvalue { <16 x i8>, <16 x i8> } %65, 0 %.fca.1.extract18 = extractvalue { <16 x i8>, <16 x i8> } %65, 1 - %66 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %59) + %66 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %59) %.fca.0.extract11 = extractvalue { <16 x i8>, <16 x i8> } %66, 0 %.fca.1.extract12 = extractvalue { <16 x i8>, <16 x i8> } %66, 1 - %67 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %60) + %67 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %60) %.fca.0.extract5 = extractvalue { <16 x i8>, <16 x i8> } %67, 0 %.fca.1.extract6 = extractvalue { <16 x i8>, <16 x i8> } %67, 1 - %68 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %61) + %68 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %61) %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %68, 0 %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %68, 1 %69 = bitcast <16 x i8> %.fca.0.extract29 to <2 x double> @@ -518,38 +518,38 @@ %81 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %80, <2 x double> %70, <2 x double> %49) %82 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_ix_dim_0_113, i64 4 %83 = bitcast %_elem_type_of_x* %82 to i8* - %84 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %83) + %84 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %83) %85 = getelementptr i8, i8* %a_ix_dim_1_, i64 32 - %86 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %85) + %86 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %85) %87 = getelementptr i8, i8* %a_ix_dim_1_29, i64 32 - %88 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %87) + %88 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %87) %89 = getelementptr i8, i8* %a_ix_dim_1_45, i64 32 - %90 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %89) + %90 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %89) %91 = getelementptr i8, i8* %a_ix_dim_1_61, i64 32 - %92 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %91) + %92 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %91) %93 = getelementptr i8, i8* %a_ix_dim_1_77, i64 32 - %94 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %93) + %94 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %93) %95 = getelementptr i8, i8* %a_ix_dim_1_93, i64 32 - %96 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %95) - %97 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %84) + %96 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %95) + %97 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %84) %.fca.0.extract37 = 
extractvalue { <16 x i8>, <16 x i8> } %97, 0 %.fca.1.extract39 = extractvalue { <16 x i8>, <16 x i8> } %97, 1 - %98 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %86) + %98 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %86) %.fca.0.extract31 = extractvalue { <16 x i8>, <16 x i8> } %98, 0 %.fca.1.extract33 = extractvalue { <16 x i8>, <16 x i8> } %98, 1 - %99 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %88) + %99 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %88) %.fca.0.extract25 = extractvalue { <16 x i8>, <16 x i8> } %99, 0 %.fca.1.extract27 = extractvalue { <16 x i8>, <16 x i8> } %99, 1 - %100 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %90) + %100 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %90) %.fca.0.extract19 = extractvalue { <16 x i8>, <16 x i8> } %100, 0 %.fca.1.extract21 = extractvalue { <16 x i8>, <16 x i8> } %100, 1 - %101 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %92) + %101 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %92) %.fca.0.extract13 = extractvalue { <16 x i8>, <16 x i8> } %101, 0 %.fca.1.extract15 = extractvalue { <16 x i8>, <16 x i8> } %101, 1 - %102 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %94) + %102 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %94) %.fca.0.extract7 = extractvalue { <16 x i8>, <16 x i8> } %102, 0 %.fca.1.extract9 = extractvalue { <16 x i8>, <16 x i8> } %102, 1 - %103 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %96) + %103 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %96) %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %103, 0 %.fca.1.extract3 = extractvalue { <16 x i8>, <16 x i8> } %103, 1 %104 = bitcast <16 x i8> %.fca.1.extract30 to <2 x double> @@ -631,7 +631,7 @@ ret void } -declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) -declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) +declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>) declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics-without-mma.ll b/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics-without-mma.ll deleted file mode 100644 --- a/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics-without-mma.ll +++ /dev/null @@ -1,59 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \ -; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -mattr=-mma \ -; RUN: < %s | FileCheck %s - -; This test is to check that the paired vector intrinsics are available even -; when MMA is disabled. 
- -define <16 x i8> @test1(<256 x i1>* %ptr) { -; CHECK-LABEL: test1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxv v3, 0(r3) -; CHECK-NEXT: lxv v2, 16(r3) -; CHECK-NEXT: vaddubm v2, v3, v2 -; CHECK-NEXT: blr -entry: - %0 = load <256 x i1>, <256 x i1>* %ptr, align 32 - %1 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %0) - %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 - %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 - %add = add <16 x i8> %2, %3 - ret <16 x i8> %add -} - -declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) - -define void @test2(<16 x i8> %v1, <16 x i8> %v2, <256 x i1>* %ptr) { -; CHECK-LABEL: test2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmr v4, v3 -; CHECK-NEXT: vmr v5, v2 -; CHECK-NEXT: stxv v4, 16(r7) -; CHECK-NEXT: stxv v5, 0(r7) -; CHECK-NEXT: blr -entry: - %0 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %v2, <16 x i8> %v1) - store <256 x i1> %0, <256 x i1>* %ptr, align 32 - ret void -} - -declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) - -define void @test3(<256 x i1>* %ptr) { -; CHECK-LABEL: test3: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvp vsp0, 0(r3) -; CHECK-NEXT: stxvp vsp0, 32(r3) -; CHECK-NEXT: blr -entry: - %0 = bitcast <256 x i1>* %ptr to i8* - %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) - %add.ptr1 = getelementptr inbounds <256 x i1>, <256 x i1>* %ptr, i64 1 - %2 = bitcast <256 x i1>* %add.ptr1 to i8* - tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2) - ret void -} - -declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) -declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) diff --git a/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics.ll b/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics.ll @@ -0,0 +1,357 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -mattr=-mma \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-NOMMA +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O3 \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O3 \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -mattr=-mma \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-BE-NOMMA + +; This test also checks that the paired vector intrinsics are available even +; when MMA is disabled. 
+ +; assemble_pair +declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) +define void @ass_pair(<256 x i1>* %ptr, <16 x i8> %vc) { +; CHECK-LABEL: ass_pair: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: ass_pair: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: vmr v3, v2 +; CHECK-NOMMA-NEXT: stxv v2, 16(r3) +; CHECK-NOMMA-NEXT: stxv v3, 0(r3) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: ass_pair: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: vmr v3, v2 +; CHECK-BE-NEXT: stxv v2, 16(r3) +; CHECK-BE-NEXT: stxv v2, 0(r3) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: ass_pair: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: vmr v3, v2 +; CHECK-BE-NOMMA-NEXT: stxv v2, 16(r3) +; CHECK-BE-NOMMA-NEXT: stxv v2, 0(r3) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %vc, <16 x i8> %vc) + store <256 x i1> %0, <256 x i1>* %ptr, align 32 + ret void +} + +; disassemble_pair +declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>) +define void @disass_pair(<256 x i1>* %ptr1, <16 x i8>* %ptr2, <16 x i8>* %ptr3) { +; CHECK-LABEL: disass_pair: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 0(r3) +; CHECK-NEXT: lxv vs0, 16(r3) +; CHECK-NEXT: stxv vs1, 0(r4) +; CHECK-NEXT: stxv vs0, 0(r5) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: disass_pair: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxv vs1, 0(r3) +; CHECK-NOMMA-NEXT: lxv vs0, 16(r3) +; CHECK-NOMMA-NEXT: stxv vs1, 0(r4) +; CHECK-NOMMA-NEXT: stxv vs0, 0(r5) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: disass_pair: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs0, 0(r4) +; CHECK-BE-NEXT: stxv vs1, 0(r5) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: disass_pair: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NOMMA-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NOMMA-NEXT: stxv vs0, 0(r4) +; CHECK-BE-NOMMA-NEXT: stxv vs1, 0(r5) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = load <256 x i1>, <256 x i1>* %ptr1, align 32 + %1 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %0) + %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 + %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 + store <16 x i8> %2, <16 x i8>* %ptr2, align 16 + store <16 x i8> %3, <16 x i8>* %ptr3, align 16 + ret void +} + +define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 0(r3) +; CHECK-NEXT: stxvp vsp0, 0(r4) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_1: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxvp vsp0, 0(r3) +; CHECK-NOMMA-NEXT: stxvp vsp0, 0(r4) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 0(r3) +; CHECK-BE-NEXT: stxvp vsp0, 0(r4) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_1: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxvp vsp0, 0(r3) +; CHECK-BE-NOMMA-NEXT: stxvp vsp0, 0(r4) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %0) + %2 = bitcast <256 x i1>* %vp2 to i8* + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %1, i8* %2) + ret void +} + +declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) +declare void 
@llvm.ppc.vsx.stxvp(<256 x i1>, i8*) + +define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvpx vsp0, r3, r4 +; CHECK-NEXT: stxvpx vsp0, r5, r4 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_2: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r4 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r5, r4 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvpx vsp0, r3, r4 +; CHECK-BE-NEXT: stxvpx vsp0, r5, r4 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_2: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r4 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r5, r4 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 %offset + %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 %offset + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 18 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_3: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: li r5, 18 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 18 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_3: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: li r5, 18 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 18 + %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 18 + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_4: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: li r5, 1 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 1 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_4: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: li r5, 1 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 1 + %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 1 + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 
42 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_5: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: li r5, 42 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 42 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_5: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: li r5, 42 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 42 + %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 42 + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 4096(r3) +; CHECK-NEXT: stxvp vsp0, 4096(r4) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_6: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxvp vsp0, 4096(r3) +; CHECK-NOMMA-NEXT: stxvp vsp0, 4096(r4) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_6: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 4096(r3) +; CHECK-BE-NEXT: stxvp vsp0, 4096(r4) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_6: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxvp vsp0, 4096(r3) +; CHECK-BE-NOMMA-NEXT: stxvp vsp0, 4096(r4) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128 + %1 = bitcast <256 x i1>* %0 to i8* + %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %1) + %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128 + %4 = bitcast <256 x i1>* %3 to i8* + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; FIXME: A prefixed load (plxvp) is expected here as the offset in this +; test case is a constant that fits within 34-bits. +; CHECK-LABEL: test_ldst_7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: ori r5, r5, 32799 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_7: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: li r5, 0 +; CHECK-NOMMA-NEXT: ori r5, r5, 32799 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_7: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 0 +; CHECK-BE-NEXT: ori r5, r5, 32799 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_7: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: li r5, 0 +; CHECK-BE-NOMMA-NEXT: ori r5, r5, 32799 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 32799 + %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 32799 + tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %2, i8* %4) + ret void +}
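
Note (editorial illustration, not part of the patch): the tests above exercise the renamed llvm.ppc.vsx.* paired-vector intrinsics directly from LLVM IR. For context, the sketch below shows how the corresponding vsx_-prefixed builtins might be used from C so that clang emits these intrinsics. It is a minimal sketch, assuming a pwr10 target with paired-vector support and the argument order given by the CUSTOM_BUILTIN type strings (a signed long long byte offset followed by a __vector_pair pointer); the function name copy_pair and the zero offsets are purely illustrative.

// Minimal sketch (illustrative, not from the patch): load a vector pair from
// *src and store it to *dst using the renamed paired-vector builtins.
// Assumes __builtin_vsx_lxvp(long long, const __vector_pair *) and
// __builtin_vsx_stxvp(__vector_pair, long long, __vector_pair *).
void copy_pair(const unsigned char *src, unsigned char *dst) {
  __vector_pair p = __builtin_vsx_lxvp(0LL, (const __vector_pair *)src);
  __builtin_vsx_stxvp(p, 0LL, (__vector_pair *)dst);
}

As with the IR tests above, the intent is that this works even with MMA disabled, provided the paired-vector memory operations are available on the target.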