diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2047,6 +2047,10 @@
     GE_Missing_ucontext
   };

+  QualType DecodeTypeStr(const char *&Str, const ASTContext &Context,
+                         ASTContext::GetBuiltinTypeError &Error,
+                         bool &RequireICE, bool AllowTypeModifiers) const;
+
   /// Return the type for the specified builtin.
   ///
   /// If \p IntegerConstantArgs is non-null, it is filled in with a bitmask of
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -7,14 +7,22 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines the PowerPC-specific builtin function database. Users of
-// this file must define the BUILTIN macro to make use of this information.
+// this file must define the BUILTIN macro or the MMA_BUILTIN macro to make use
+// of this information.
 //
 //===----------------------------------------------------------------------===//

 // FIXME: this needs to be the full list supported by GCC. Right now, I'm just
 // adding stuff on demand.

-// The format of this database matches clang/Basic/Builtins.def.
+// The format of this database matches clang/Basic/Builtins.def, except for the
+// MMA builtins, which use their own format documented below.
+
+#if defined(BUILTIN) && !defined(MMA_BUILTIN)
+# define MMA_BUILTIN(ID, TYPES, ACCUMULATE) BUILTIN(__builtin_mma_##ID, "i.", "t")
+#elif defined(MMA_BUILTIN) && !defined(BUILTIN)
+# define BUILTIN(ID, TYPES, ATTRS)
+#endif

 BUILTIN(__builtin_ppc_get_timebase, "ULLi", "n")

@@ -646,6 +654,92 @@
 // Cache built-ins
 BUILTIN(__builtin_dcbf, "vvC*", "")

+// MMA built-ins
+// All MMA built-ins are declared here using the MMA_BUILTIN macro. Because
+// these built-ins rely on target-dependent types and to avoid pervasive
+// changes, they are type-checked manually in Sema using custom type
+// descriptors.
+// The first argument of the MMA_BUILTIN macro is the name of the built-in;
+// the second argument specifies the type of the function (result value, then
+// each argument) as follows:
+//   i -> Unsigned integer followed by the greatest possible value for that
+//        argument, or 0 if there is no constraint on the value.
+//        (e.g. i15 for a 4-bit value)
+//   v -> void
+//   V -> Vector type used with MMA builtins (vector unsigned char)
+//   W -> MMA vector type followed by the size of the vector type.
+//        (e.g. W512 for __vector_quad)
+// The 'C' suffix can be used to specify a const type.
+// The '*' suffix can be used to specify a pointer to the type.
+// The third argument is set to true if the builtin accumulates its result into
+// its given accumulator.
+
+MMA_BUILTIN(assemble_acc, "vW512*VVVV", false)
+MMA_BUILTIN(disassemble_acc, "vv*W512*", false)
+MMA_BUILTIN(assemble_pair, "vW256*VV", false)
+MMA_BUILTIN(disassemble_pair, "vv*W256*", false)
+MMA_BUILTIN(xxmtacc, "vW512*", true)
+MMA_BUILTIN(xxmfacc, "vW512*", true)
+MMA_BUILTIN(xxsetaccz, "vW512*", false)
+MMA_BUILTIN(xvi4ger8, "vW512*VV", false)
+MMA_BUILTIN(xvi8ger4, "vW512*VV", false)
+MMA_BUILTIN(xvi16ger2, "vW512*VV", false)
+MMA_BUILTIN(xvi16ger2s, "vW512*VV", false)
+MMA_BUILTIN(xvf16ger2, "vW512*VV", false)
+MMA_BUILTIN(xvf32ger, "vW512*VV", false)
+MMA_BUILTIN(xvf64ger, "vW512*W256V", false)
+MMA_BUILTIN(pmxvi4ger8, "vW512*VVi15i15i255", false)
+MMA_BUILTIN(pmxvi8ger4, "vW512*VVi15i15i15", false)
+MMA_BUILTIN(pmxvi16ger2, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(pmxvi16ger2s, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(pmxvf16ger2, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(pmxvf32ger, "vW512*VVi15i15", false)
+MMA_BUILTIN(pmxvf64ger, "vW512*W256Vi15i3", false)
+MMA_BUILTIN(xvi4ger8pp, "vW512*VV", true)
+MMA_BUILTIN(xvi8ger4pp, "vW512*VV", true)
+MMA_BUILTIN(xvi8ger4spp, "vW512*VV", true)
+MMA_BUILTIN(xvi16ger2pp, "vW512*VV", true)
+MMA_BUILTIN(xvi16ger2spp, "vW512*VV", true)
+MMA_BUILTIN(pmxvi4ger8pp, "vW512*VVi15i15i255", true)
+MMA_BUILTIN(pmxvi8ger4pp, "vW512*VVi15i15i15", true)
+MMA_BUILTIN(pmxvi8ger4spp, "vW512*VVi15i15i15", true)
+MMA_BUILTIN(pmxvi16ger2pp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvi16ger2spp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(xvf16ger2pp, "vW512*VV", true)
+MMA_BUILTIN(xvf16ger2pn, "vW512*VV", true)
+MMA_BUILTIN(xvf16ger2np, "vW512*VV", true)
+MMA_BUILTIN(xvf16ger2nn, "vW512*VV", true)
+MMA_BUILTIN(pmxvf16ger2pp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvf16ger2pn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvf16ger2np, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvf16ger2nn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(xvf32gerpp, "vW512*VV", true)
+MMA_BUILTIN(xvf32gerpn, "vW512*VV", true)
+MMA_BUILTIN(xvf32gernp, "vW512*VV", true)
+MMA_BUILTIN(xvf32gernn, "vW512*VV", true)
+MMA_BUILTIN(pmxvf32gerpp, "vW512*VVi15i15", true)
+MMA_BUILTIN(pmxvf32gerpn, "vW512*VVi15i15", true)
+MMA_BUILTIN(pmxvf32gernp, "vW512*VVi15i15", true)
+MMA_BUILTIN(pmxvf32gernn, "vW512*VVi15i15", true)
+MMA_BUILTIN(xvf64gerpp, "vW512*W256V", true)
+MMA_BUILTIN(xvf64gerpn, "vW512*W256V", true)
+MMA_BUILTIN(xvf64gernp, "vW512*W256V", true)
+MMA_BUILTIN(xvf64gernn, "vW512*W256V", true)
+MMA_BUILTIN(pmxvf64gerpp, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(pmxvf64gerpn, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(pmxvf64gernp, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(pmxvf64gernn, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(xvbf16ger2, "vW512*VV", false)
+MMA_BUILTIN(pmxvbf16ger2, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(xvbf16ger2pp, "vW512*VV", true)
+MMA_BUILTIN(xvbf16ger2pn, "vW512*VV", true)
+MMA_BUILTIN(xvbf16ger2np, "vW512*VV", true)
+MMA_BUILTIN(xvbf16ger2nn, "vW512*VV", true)
+MMA_BUILTIN(pmxvbf16ger2pp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true)
+
 // FIXME: Obviously incomplete.
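As an illustration of the descriptor format described above (this example is not part of the patch, and the parameter names are invented for readability), the entry MMA_BUILTIN(pmxvf64ger, "vW512*W256Vi15i3", false) corresponds to a user-level prototype along these lines:

    /* "vW512*W256Vi15i3": 'v' means a void result, then one operand per descriptor. */
    void __builtin_mma_pmxvf64ger(__vector_quad *acc,      /* W512*: pointer to the 512-bit MMA type */
                                  __vector_pair vp,        /* W256 : the 256-bit MMA type, by value  */
                                  vector unsigned char vc, /* V    : vector unsigned char            */
                                  const int xmask,         /* i15  : constant in the range [0, 15]   */
                                  const int ymask);        /* i3   : constant in the range [0, 3]    */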
 #undef BUILTIN
+#undef MMA_BUILTIN
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12348,6 +12348,7 @@
                                         int ArgNum, unsigned ExpectedFieldNum,
                                         bool AllowName);
   bool SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool SemaBuiltinPPCMMACall(CallExpr *TheCall, const char *TypeDesc);

   // Matrix builtin handling.
   ExprResult SemaBuiltinMatrixTranspose(CallExpr *TheCall,
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -10327,6 +10327,18 @@
   return Type;
 }

+// On some targets such as PowerPC, some of the builtins are defined with
+// custom type descriptors for target-dependent types. These descriptors are
+// decoded in other functions, but it may be useful to be able to fall back to
+// default descriptor decoding to define builtins mixing target-dependent and
+// target-independent types. This function allows decoding one type descriptor
+// with default decoding.
+QualType ASTContext::DecodeTypeStr(const char *&Str, const ASTContext &Context,
+                                   GetBuiltinTypeError &Error, bool &RequireICE,
+                                   bool AllowTypeModifiers) const {
+  return DecodeTypeFromStr(Str, Context, Error, RequireICE, AllowTypeModifiers);
+}
+
 /// GetBuiltinType - Return the type for the specified builtin.
 QualType ASTContext::GetBuiltinType(unsigned Id, GetBuiltinTypeError &Error,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14707,6 +14707,64 @@
     return Builder.CreateExtractElement(Unpacked, Index);
   }
+
+  // The PPC MMA builtins take a pointer to a __vector_quad as an argument.
+  // Some of the MMA instructions accumulate their result into an existing
+  // accumulator whereas the others generate a new accumulator. So we need to
+  // use custom code generation to expand a builtin call with a pointer to a
+  // load (if the corresponding instruction accumulates its result) followed by
+  // the call to the intrinsic and a store of the result.
+#define MMA_BUILTIN(Name, Types, Accumulate) \
+  case PPC::BI__builtin_mma_##Name:
+#include "clang/Basic/BuiltinsPPC.def"
+  {
+    // The first argument of these two builtins is a pointer used to store
+    // their result. However, the llvm intrinsics return their result in
+    // multiple return values. So, here we emit code extracting these values
+    // from the intrinsic results and storing them using that pointer.
+    if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc ||
+        BuiltinID == PPC::BI__builtin_mma_disassemble_pair) {
+      unsigned NumVecs = 2;
+      auto Intrinsic = Intrinsic::ppc_mma_disassemble_pair;
+      if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) {
+        NumVecs = 4;
+        Intrinsic = Intrinsic::ppc_mma_disassemble_acc;
+      }
+      llvm::Function *F = CGM.getIntrinsic(Intrinsic);
+      Address Addr = EmitPointerWithAlignment(E->getArg(1));
+      Value *Vec = Builder.CreateLoad(Addr);
+      Value *Call = Builder.CreateCall(F, {Vec});
+      llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, 16);
+      Value *Ptr = Builder.CreateBitCast(Ops[0], VTy->getPointerTo());
+      for (unsigned i = 0; i < NumVecs; i++) {
+        Value *Vec = Builder.CreateExtractValue(Call, i);
+        llvm::ConstantInt *Index = llvm::ConstantInt::get(IntTy, i);
+        Value *GEP = Builder.CreateInBoundsGEP(VTy, Ptr, Index);
+        Builder.CreateAlignedStore(Vec, GEP, MaybeAlign(16));
+      }
+      return Call;
+    }
+    unsigned ID = Intrinsic::not_intrinsic;
+    bool Accumulate = false;
+    switch (BuiltinID) {
+#define MMA_BUILTIN(Name, Types, Acc) \
+    case PPC::BI__builtin_mma_##Name: \
+      ID = Intrinsic::ppc_mma_##Name; \
+      Accumulate = Acc; \
+      break;
+#include "clang/Basic/BuiltinsPPC.def"
+    }
+    SmallVector<Value *, 4> CallOps;
+    if (Accumulate) {
+      Address Addr = EmitPointerWithAlignment(E->getArg(0));
+      Value *Acc = Builder.CreateLoad(Addr);
+      CallOps.push_back(Acc);
+    }
+    for (unsigned i = 1; i < Ops.size(); i++)
+      CallOps.push_back(Ops[i]);
+    llvm::Function *F = CGM.getIntrinsic(ID);
+    Value *Call = Builder.CreateCall(F, CallOps);
+    return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign(64));
+  }
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
+bool Sema::SemaBuiltinPPCMMACall(CallExpr *TheCall, const char *TypeStr) {
+
+  assert((TypeStr[0] != '\0') &&
+         "Invalid types in PPC MMA builtin declaration");
+
+  unsigned Mask = 0;
+  unsigned ArgNum = 0;
+
+  // The first type in TypeStr is the type of the value returned by the
+  // builtin, so read it first and set the type of TheCall accordingly.
+  QualType type = DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
+  TheCall->setType(type);
+
+  while (*TypeStr != '\0') {
+    Mask = 0;
+    QualType ExpectedType = DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
+    if (ArgNum >= TheCall->getNumArgs()) {
+      ArgNum++;
+      break;
+    }
+
+    Expr *Arg = TheCall->getArg(ArgNum);
+    QualType ArgType = Arg->getType();
+
+    if ((ExpectedType->isVoidPointerType() && !ArgType->isPointerType()) ||
+        (!ExpectedType->isVoidPointerType() &&
+         ArgType.getCanonicalType() != ExpectedType))
+      return Diag(Arg->getBeginLoc(), diag::err_typecheck_convert_incompatible)
+             << ArgType << ExpectedType << 1 << 0 << 0;
+
+    // If the value of Mask is not 0, there is a constraint on the value of the
+    // integer argument, so ensure here that the argument is a constant that is
+    // in the valid range.
+    if (Mask != 0 &&
+        SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, Mask, true))
+      return true;
+
+    ArgNum++;
+  }
+
+  // If we exited the previous loop early, there are still types left to read
+  // from TypeStr. Read them all to determine the expected number of arguments,
+  // so that a better error message can be emitted if TheCall does not have the
+  // right number of arguments.
+  while (*TypeStr != '\0') {
+    (void) DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
+    ArgNum++;
+  }
+  if (checkArgCount(*this, TheCall, ArgNum))
+    return true;
+
+  return false;
+}
+
 /// SemaBuiltinLongjmp - Handle __builtin_longjmp(void *env[5], int val).
 /// This checks that the target supports __builtin_longjmp and
 /// that val is a constant 1.
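The ASTContext::DecodeTypeStr hook added above exists so that a target-specific decoder can handle its own descriptor letters and delegate the standard ones to the default decoding. The following is only a simplified sketch of that pattern, not the decoder added by this patch; the helper name and the exact handling of the 'i' letter are assumptions:

    // Sketch: decode one custom descriptor, falling back to the default decoding.
    static QualType DecodeCustomTypeFromStr(ASTContext &Context, const char *&Str,
                                            unsigned &Mask) {
      bool RequireICE = false;
      ASTContext::GetBuiltinTypeError Error = ASTContext::GE_None;
      switch (*Str++) {
      case 'i': {
        // Unsigned integer whose maximum allowed value follows the letter
        // (e.g. "i15"); the bound is returned through Mask for later range checks.
        char *End = nullptr;
        Mask = strtoul(Str, &End, 10);
        Str = End;
        return Context.IntTy;
      }
      default:
        // Any other letter is a standard Builtins.def descriptor: rewind one
        // character and let the default decoder handle it.
        return Context.DecodeTypeStr(--Str, Context, Error, RequireICE,
                                     /*AllowTypeModifiers=*/true);
      }
    }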
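For a usage-level view of the checking performed by SemaBuiltinPPCMMACall, the sketch below (not one of the tests added by the patch; the function name is invented) shows calls that should be accepted or rejected under the descriptors defined in BuiltinsPPC.def:

    void example(__vector_quad *acc, __vector_pair *pp, vector unsigned char vc) {
      __builtin_mma_xvf32gerpp(acc, vc, vc);            /* OK: matches "vW512*VV"               */
      __builtin_mma_xvf64ger(acc, *pp, vc);             /* OK: matches "vW512*W256V"            */
      __builtin_mma_pmxvi4ger8(acc, vc, vc, 0, 0, 256); /* rejected: the "i255" operand must be
                                                           a constant in the range [0, 255]     */
      __builtin_mma_xvf64ger(acc, vc, vc);              /* rejected: the "W256" operand must be
                                                           a __vector_pair, not a vector        */
    }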
diff --git a/clang/test/CodeGen/builtins-ppc-mma.c b/clang/test/CodeGen/builtins-ppc-mma.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/builtins-ppc-mma.c @@ -0,0 +1,1038 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu future -emit-llvm %s -o - | FileCheck %s + +// CHECK-LABEL: @test1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __vector_quad res; + __builtin_mma_assemble_acc(&res, vc, vc, vc, vc); + *((__vector_quad *)resp) = res; +} + +// CHECK-LABEL: @test2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64 +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP3]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 16 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP7]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 32 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[TMP8]], <16 x i8>* [[TMP10]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 48 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[TMP13]], align 16 +// CHECK-NEXT: ret void +// +void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __builtin_mma_disassemble_acc(resp, (__vector_quad*)vqp); +} + +// CHECK-LABEL: @test3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <256 x i1>* +// CHECK-NEXT: store <256 x i1> [[TMP0]], <256 x i1>* [[TMP1]], align 32, !tbaa !6 +// CHECK-NEXT: ret void +// +void test3(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __vector_pair res; + __builtin_mma_assemble_pair(&res, vc, vc); + *((__vector_pair *)resp) = res; +} + +// CHECK-LABEL: @test4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP2]], 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP3]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 16 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP7]], align 16 +// CHECK-NEXT: ret void +// +void test4(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __builtin_mma_disassemble_pair(resp, (__vector_pair*)vpp); +} + +// CHECK-LABEL: @test5( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test5(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xxmtacc(&vq); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test6( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test6(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xxmfacc(&vq); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test7( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test7(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xxsetaccz(&vq); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test8(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + 
__builtin_mma_xvi4ger8(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test9( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test9(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi8ger4(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test10( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test10(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test11( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test11(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2s(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test12( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test12(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test13( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test13(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32ger(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test14( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] 
to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test14(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64ger(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test15( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test15(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi4ger8(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test16(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi8ger4(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test17( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test17(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test18( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test18(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2s(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test19( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test19(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = 
*((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test20( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test20(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32ger(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test21( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test21(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf64ger(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test22( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test22(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi4ger8pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test23( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test23(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi8ger4pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test24( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* 
[[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test24(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi8ger4spp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test25( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test25(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test26( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test26(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2spp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test27( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test27(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi4ger8pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test28( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test28(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad 
*)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi8ger4pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test29( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test29(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi8ger4spp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test30( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test30(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test31( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test31(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2spp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test32(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test33( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test33(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2pn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test34( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test34(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2np(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test35( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test35(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2nn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test36( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test36(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test37( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test37(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2pn(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test38( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test38(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2np(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test39( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test39(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2nn(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test40( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test40(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gerpp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test41( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test41(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char 
*resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gerpn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test42( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test42(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gernp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test43( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test43(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gernn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test44( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test44(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gerpp(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test45( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test45(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gerpn(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test46( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* 
+// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test46(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gernp(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test47( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test47(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gernn(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test48( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test48(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gerpp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test49( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test49(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gerpn(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test50( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* 
[[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test50(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test51( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test51(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gernn(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test52( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test52(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf64gerpp(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test53( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test53(unsigned 
char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf64gerpn(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test54( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test54(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf64gernp(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test55( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test55(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test56( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test56(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvbf16ger2(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test57( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test57(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvbf16ger2(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test58( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test58(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvbf16ger2pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test59( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test59(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvbf16ger2pn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test60( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test60(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvbf16ger2np(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test61( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test61(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvbf16ger2nn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test62( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) 
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test62(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvbf16ger2pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test63( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test63(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvbf16ger2pn(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test64(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvbf16ger2np(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test65( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test65(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvbf16ger2nn(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +}