diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -7,14 +7,22 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines the PowerPC-specific builtin function database. Users of
-// this file must define the BUILTIN macro to make use of this information.
+// this file must define the BUILTIN macro or the MMA_BUILTIN macro to make use
+// of this information.
 //
 //===----------------------------------------------------------------------===//

 // FIXME: this needs to be the full list supported by GCC. Right now, I'm just
 // adding stuff on demand.

-// The format of this database matches clang/Basic/Builtins.def.
+// The format of this database matches clang/Basic/Builtins.def, except for the
+// MMA builtins, which use their own format documented below.
+
+#if defined(BUILTIN) && !defined(MMA_BUILTIN)
+#  define MMA_BUILTIN(ID, TYPES, ACCUMULATE) BUILTIN(__builtin_mma_##ID, "i.", "t")
+#elif defined(MMA_BUILTIN) && !defined(BUILTIN)
+#  define BUILTIN(ID, TYPES, ATTRS)
+#endif

 BUILTIN(__builtin_ppc_get_timebase, "ULLi", "n")

@@ -481,6 +489,94 @@
 // Cache built-ins
 BUILTIN(__builtin_dcbf, "vvC*", "")

+// MMA built-ins
+// All MMA built-ins are declared here using the MMA_BUILTIN macro. Because
+// these built-ins rely on target-dependent types and to avoid pervasive
+// changes, they are type checked manually in Sema using custom type
+// descriptors.
+// The first argument of the MMA_BUILTIN macro is the name of the built-in.
+// The second argument specifies the type of the function (result value, then
+// each argument) as follows:
+//  i -> Unsigned integer, followed by the greatest allowed value for that
+//       argument, or 0 if the value is unconstrained
+//       (e.g. i15 for a 4-bit value).
+//  v -> void
+//  V -> Vector type used with MMA built-ins (vector unsigned char)
+//  W -> MMA vector type, followed by the size of the vector type in bits
+//       (e.g. W512 for __vector_quad).
+// The '*' suffix specifies a pointer to a type. It can only be used with the
+// void and vector types (e.g. W512*).
+// The third argument is set to true if the built-in needs an input accumulator
+// to accumulate its result.
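
To make the descriptor encoding concrete, here is how one entry from the list below reads once decoded. This is an illustrative prototype, not text from the patch; the `__vector_quad` type comes from PPCTypes.def, which this series builds on:

```c
/* MMA_BUILTIN(pmxvf16ger2, "vW512*VVi15i15i3", false) decodes to:
 *   v      -> void result
 *   W512*  -> pointer to the 512-bit MMA type, i.e. __vector_quad *
 *   V, V   -> two "vector unsigned char" operands
 *   i15, i15, i3 -> three constant ints, restricted to the ranges
 *                   [0, 15], [0, 15] and [0, 3] respectively
 */
void __builtin_mma_pmxvf16ger2(__vector_quad *, vector unsigned char,
                               vector unsigned char, const int, const int,
                               const int);
```
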
+
+MMA_BUILTIN(assemble_acc, "vW512*VVVV", false)
+MMA_BUILTIN(disassemble_acc, "vv*W512*", false)
+MMA_BUILTIN(assemble_pair, "vW256*VV", false)
+MMA_BUILTIN(disassemble_pair, "vv*W256*", false)
+MMA_BUILTIN(xxmtacc, "vW512*", true)
+MMA_BUILTIN(xxmfacc, "vW512*", true)
+MMA_BUILTIN(xxsetaccz, "vW512*", false)
+MMA_BUILTIN(xvi4ger8, "vW512*VV", false)
+MMA_BUILTIN(xvi8ger4, "vW512*VV", false)
+MMA_BUILTIN(xvi16ger2, "vW512*VV", false)
+MMA_BUILTIN(xvi16ger2s, "vW512*VV", false)
+MMA_BUILTIN(xvf16ger2, "vW512*VV", false)
+MMA_BUILTIN(xvf32ger, "vW512*VV", false)
+MMA_BUILTIN(xvf64ger, "vW512*W256V", false)
+MMA_BUILTIN(pmxvi4ger8, "vW512*VVi15i15i255", false)
+MMA_BUILTIN(pmxvi8ger4, "vW512*VVi15i15i15", false)
+MMA_BUILTIN(pmxvi16ger2, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(pmxvi16ger2s, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(pmxvf16ger2, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(pmxvf32ger, "vW512*VVi15i15", false)
+MMA_BUILTIN(pmxvf64ger, "vW512*W256Vi15i3", false)
+MMA_BUILTIN(xvi4ger8pp, "vW512*VV", true)
+MMA_BUILTIN(xvi8ger4pp, "vW512*VV", true)
+MMA_BUILTIN(xvi8ger4spp, "vW512*VV", true)
+MMA_BUILTIN(xvi16ger2pp, "vW512*VV", true)
+MMA_BUILTIN(xvi16ger2spp, "vW512*VV", true)
+MMA_BUILTIN(pmxvi4ger8pp, "vW512*VVi15i15i255", true)
+MMA_BUILTIN(pmxvi8ger4pp, "vW512*VVi15i15i15", true)
+MMA_BUILTIN(pmxvi8ger4spp, "vW512*VVi15i15i15", true)
+MMA_BUILTIN(pmxvi16ger2pp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvi16ger2spp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(xvf16ger2pp, "vW512*VV", true)
+MMA_BUILTIN(xvf16ger2pn, "vW512*VV", true)
+MMA_BUILTIN(xvf16ger2np, "vW512*VV", true)
+MMA_BUILTIN(xvf16ger2nn, "vW512*VV", true)
+MMA_BUILTIN(pmxvf16ger2pp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvf16ger2pn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvf16ger2np, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvf16ger2nn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(xvf32gerpp, "vW512*VV", true)
+MMA_BUILTIN(xvf32gerpn, "vW512*VV", true)
+MMA_BUILTIN(xvf32gernp, "vW512*VV", true)
+MMA_BUILTIN(xvf32gernn, "vW512*VV", true)
+MMA_BUILTIN(pmxvf32gerpp, "vW512*VVi15i15", true)
+MMA_BUILTIN(pmxvf32gerpn, "vW512*VVi15i15", true)
+MMA_BUILTIN(pmxvf32gernp, "vW512*VVi15i15", true)
+MMA_BUILTIN(pmxvf32gernn, "vW512*VVi15i15", true)
+MMA_BUILTIN(xvf64gerpp, "vW512*W256V", true)
+MMA_BUILTIN(xvf64gerpn, "vW512*W256V", true)
+MMA_BUILTIN(xvf64gernp, "vW512*W256V", true)
+MMA_BUILTIN(xvf64gernn, "vW512*W256V", true)
+MMA_BUILTIN(pmxvf64gerpp, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(pmxvf64gerpn, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(pmxvf64gernp, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(pmxvf64gernn, "vW512*W256Vi15i3", true)
+MMA_BUILTIN(xvbf16ger2, "vW512*VV", false)
+MMA_BUILTIN(pmxvbf16ger2, "vW512*VVi15i15i3", false)
+MMA_BUILTIN(xvbf16ger2pp, "vW512*VV", true)
+MMA_BUILTIN(xvbf16ger2pn, "vW512*VV", true)
+MMA_BUILTIN(xvbf16ger2np, "vW512*VV", true)
+MMA_BUILTIN(xvbf16ger2nn, "vW512*VV", true)
+MMA_BUILTIN(pmxvbf16ger2pp, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true)
+BUILTIN(__builtin_xvcvspbf16, "V16UcV16Uc", "")
+BUILTIN(__builtin_xvcvbf16sp, "V16UcV16Uc", "")
+
 // FIXME: Obviously incomplete.
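
Given the masks above, the manual type checking added in SemaChecking.cpp (further down in this patch) rejects out-of-range constants. A hypothetical misuse, assuming the standard constant-range diagnostic emitted by SemaBuiltinConstantArgRange:

```c
void bad_mask(__vector_quad *acc, vector unsigned char vc) {
  /* pmxvi4ger8's descriptor is "vW512*VVi15i15i255", so the last argument
     must be a constant in [0, 255]; 256 is diagnosed along the lines of:
     "argument value 256 is outside the valid range [0, 255]" */
  __builtin_mma_pmxvi4ger8(acc, vc, vc, 0, 0, 256);
}
```
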
 #undef BUILTIN
+#undef MMA_BUILTIN
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12114,6 +12114,7 @@
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
   bool SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool SemaBuiltinPPCMMACall(CallExpr *TheCall, const char *TypeDesc);

 public:
   enum FormatStringType {
     FST_Scanf,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14244,6 +14244,63 @@
     return Builder.CreateExtractElement(Unpacked, Index);
   }

+  // The PPC MMA builtins take a pointer to an accumulator register as an
+  // argument. Some of the MMA intrinsics generate a new accumulator and
+  // return it. The others accumulate their result into a given accumulator
+  // and return it. So we need custom code generation to expand a builtin
+  // call to a load (only if the associated intrinsic accumulates its
+  // result), followed by the intrinsic call and a store.
+#define MMA_BUILTIN(Name, Types, Accumulate) \
+  case PPC::BI__builtin_mma_##Name:
+#include "clang/Basic/BuiltinsPPC.def"
+  {
+    // The first argument of these two builtins is a pointer used to store
+    // their result. However, the llvm intrinsics return their result in
+    // multiple return values. So, here we emit code extracting these values
+    // from the intrinsic results and storing them using that pointer.
+    if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc ||
+        BuiltinID == PPC::BI__builtin_mma_disassemble_pair) {
+      unsigned NumVecs = 2;
+      auto Intrinsic = Intrinsic::ppc_mma_disassemble_pair;
+      if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) {
+        NumVecs = 4;
+        Intrinsic = Intrinsic::ppc_mma_disassemble_acc;
+      }
+      llvm::Function *F = CGM.getIntrinsic(Intrinsic);
+      Address Addr = EmitPointerWithAlignment(E->getArg(1));
+      Value *Vec = Builder.CreateLoad(Addr);
+      Value *Call = Builder.CreateCall(F, {Vec});
+      llvm::Type *VTy = llvm::VectorType::get(Int8Ty, 16);
+      Value *Ptr = Builder.CreateBitCast(Ops[0], VTy->getPointerTo());
+      for (unsigned i = 0; i < NumVecs; i++) {
+        Value *Vec = Builder.CreateExtractValue(Call, i);
+        llvm::ConstantInt *Index = llvm::ConstantInt::get(IntTy, i);
+        Value *GEP = Builder.CreateInBoundsGEP(VTy, Ptr, Index);
+        Builder.CreateAlignedStore(Vec, GEP, MaybeAlign(16));
+      }
+      return Call;
+    }
+    unsigned ID;
+    bool Accumulate;
+    switch (BuiltinID) {
+    default: llvm_unreachable("Unsupported builtin");
+#define MMA_BUILTIN(Name, Types, Acc) \
+    case PPC::BI__builtin_mma_##Name: \
+      ID = Intrinsic::ppc_mma_##Name; \
+      Accumulate = Acc; \
+      break;
+#include "clang/Basic/BuiltinsPPC.def"
+    }
+    SmallVector<Value *, 4> CallOps;
+    if (Accumulate) {
+      Address Addr = EmitPointerWithAlignment(E->getArg(0));
+      Value *Acc = Builder.CreateLoad(Addr);
+      CallOps.push_back(Acc);
+    }
+    for (unsigned i = 1; i < Ops.size(); i++)
+      CallOps.push_back(Ops[i]);
+    llvm::Function *F = CGM.getIntrinsic(ID);
+    Value *Call = Builder.CreateCall(F, CallOps);
+    return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign(64));
+  }
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ ... @@
   SourceRange range(call->getArg(desiredArgCount)->getBeginLoc(),
                     call->getArg(argCount - 1)->getEndLoc());
   return S.Diag(range.getBegin(), diag::err_typecheck_call_too_many_args)
-         << 0 /*function call*/ << desiredArgCount << argCount
-         << call->getArg(1)->getSourceRange();
+         << 0 /*function call*/ << desiredArgCount << argCount << range;
 }

 /// Check that the first argument to __builtin_annotation is an integer
@@ -3002,6 +3001,53 @@
     SemaBuiltinConstantArgMultiple(TheCall, i, m);
 }

+/// DecodePPCMMATypeFromStr - This decodes one PPC MMA type descriptor from
+/// Str, advancing the pointer over the consumed characters, and returns the
+/// decoded type. If the decoded type represents a constant integer with a
+/// constraint on its value, then Mask is set to that value. The type
+/// descriptors used in Str are specific to PPC MMA builtins and are
+/// documented in the file defining the PPC builtins.
+static QualType DecodePPCMMATypeFromStr(ASTContext &Context, const char *&Str,
+                                        unsigned &Mask) {
+  switch (*Str++) {
+  case 'v': {
+    if (*Str != '*')
+      return Context.VoidTy;
+    Str++;
+    return Context.getPointerType(Context.VoidTy);
+  }
+  case 'V':
+    return Context.getVectorType(Context.UnsignedCharTy, 16,
+                                 VectorType::VectorKind::AltiVecVector);
+  case 'i': {
+    char *End;
+    unsigned size = strtoul(Str, &End, 10);
+    assert(End != Str && "Missing constant parameter constraint");
+    Str = End;
+    Mask = size;
+    return Context.IntTy;
+  }
+  case 'W': {
+    char *End;
+    unsigned size = strtoul(Str, &End, 10);
+    assert(End != Str && "Missing PowerPC MMA type size");
+    Str = End;
+    QualType Type;
+    switch (size) {
+#define PPC_MMA_VECTOR_TYPE(typeName, Id, size) \
+    case size: Type = Context.Id##Ty; break;
+#include "clang/Basic/PPCTypes.def"
+    default: llvm_unreachable("Invalid PowerPC MMA vector type");
+    }
+    if (*Str != '*')
+      return Type;
+    Str++;
+    return Context.getPointerType(Type);
+  }
+  default: llvm_unreachable("Invalid PowerPC MMA type for builtins");
+  }
+}
+
 bool Sema::CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                        CallExpr *TheCall) {
   unsigned i = 0, l = 0, u = 0;
@@ -3060,6 +3106,10 @@
     SemaBuiltinConstantArgRange(TheCall, 1, 0, 1);
   case PPC::BI__builtin_pack_vector_int128:
     return SemaVSXCheck(TheCall);
+#define MMA_BUILTIN(Name, Types, Acc) \
+  case PPC::BI__builtin_mma_##Name: \
+    return SemaBuiltinPPCMMACall(TheCall, Types);
+#include "clang/Basic/BuiltinsPPC.def"
   }
   return SemaBuiltinConstantArgRange(TheCall, i, l, u);
 }
@@ -6427,6 +6477,64 @@
   return false;
 }

+/// SemaBuiltinPPCMMACall - Check the call to a PPC MMA builtin for validity.
+/// Emit an error and return true on failure; return false on success.
+/// TypeStr is a string containing the type descriptor of the value returned by
+/// the builtin, followed by the descriptors of the expected argument types.
+bool Sema::SemaBuiltinPPCMMACall(CallExpr *TheCall, const char *TypeStr) {
+
+  assert((TypeStr[0] != '\0') &&
+         "Invalid types in PPC MMA builtin declaration");
+
+  unsigned Mask = 0;
+  unsigned ArgNum = 0;
+
+  // The first type in TypeStr is the type of the value returned by the
+  // builtin. So we first read that type and change the type of TheCall.
+  QualType type = DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
+  TheCall->setType(type);
+
+  while (*TypeStr != '\0') {
+    Mask = 0;
+    QualType ExpectedType = DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
+    if (ArgNum >= TheCall->getNumArgs()) {
+      ArgNum++;
+      break;
+    }
+
+    Expr *Arg = TheCall->getArg(ArgNum);
+    QualType ArgType = Arg->getType();
+
+    if ((ExpectedType->isVoidPointerType() && !ArgType->isPointerType()) ||
+        (!ExpectedType->isVoidPointerType() &&
+         ArgType.getCanonicalType() != ExpectedType))
+      return Diag(Arg->getBeginLoc(), diag::err_typecheck_convert_incompatible)
+             << ArgType << ExpectedType << 1 << 0 << 0;
+
+    // If the value of Mask is not 0, we have a constraint on the value of the
+    // integer argument, so here we ensure the argument is a constant that is
+    // in the valid range.
+    if (Mask != 0 &&
+        SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, Mask, true))
+      return true;
+
+    ArgNum++;
+  }
+
+  // In case we exited early from the previous loop, there are other types to
+  // read from TypeStr. So we need to read them all to ensure we have the right
+  // number of arguments in TheCall and, if that is not the case, to display a
+  // better error message.
+  while (*TypeStr != '\0') {
+    (void)DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
+    ArgNum++;
+  }
+  if (checkArgCount(*this, TheCall, ArgNum))
+    return true;
+
+  return false;
+}
+
 /// SemaBuiltinLongjmp - Handle __builtin_longjmp(void *env[5], int val).
 /// This checks that the target supports __builtin_longjmp and
 /// that val is a constant 1.
diff --git a/clang/test/CodeGen/builtins-ppc-mma.c b/clang/test/CodeGen/builtins-ppc-mma.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/builtins-ppc-mma.c
@@ -0,0 +1,1066 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu future -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: @test1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], <16 x i8> [[VC]], <16 x i8> [[VC]])
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __vector_quad res;
+  __builtin_mma_assemble_acc(&res, vc, vc, vc, vc);
+  *((__vector_quad *)resp) = res;
+}
+
+// CHECK-LABEL: @test2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 0
+// CHECK-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* [[TMP7]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 32
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[TMP8]], <16 x i8>* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP2]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 48
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[TMP13]], align 16
+// CHECK-NEXT:    ret void
+//
+void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __builtin_mma_disassemble_acc(resp, (__vector_quad*)vqp);
+}
+
+// CHECK-LABEL: @test3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]])
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1> [[TMP0]], <256 x i1>* [[TMP1]], align 32, !tbaa !6
+// CHECK-NEXT:    ret void
+//
+void test3(unsigned char *vqp,
unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __vector_pair res; + __builtin_mma_assemble_pair(&res, vc, vc); + *((__vector_pair *)resp) = res; +} + +// CHECK-LABEL: @test4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP2]], 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP3]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 16 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP7]], align 16 +// CHECK-NEXT: ret void +// +void test4(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __builtin_mma_disassemble_pair(resp, (__vector_pair*)vpp); +} + +// CHECK-LABEL: @test5( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test5(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xxmtacc(&vq); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test6( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test6(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xxmfacc(&vq); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test7( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test7(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xxsetaccz(&vq); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test8(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi4ger8(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test9( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test9(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi8ger4(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test10( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test10(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test11( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test11(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2s(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test12( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test12(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test13( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test13(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32ger(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test14( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test14(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64ger(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test15( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test15(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi4ger8(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test16(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi8ger4(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test17( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test17(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test18( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test18(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2s(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test19( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) 
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test19(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test20( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test20(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32ger(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test21( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test21(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf64ger(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test22( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test22(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi4ger8pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test23( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test23(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi8ger4pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test24( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test24(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi8ger4spp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test25( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test25(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test26( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test26(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvi16ger2spp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test27( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test27(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi4ger8pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test28( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 
x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test28(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi8ger4pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test29( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test29(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi8ger4spp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test30( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test30(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test31( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test31(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvi16ger2spp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void 
test32(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2pp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test33( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test33(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2pn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test34( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test34(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2np(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test35( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test35(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf16ger2nn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test36( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test36(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2pp(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test37( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test37(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2pn(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test38( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test38(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2np(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test39( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test39(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf16ger2nn(&vq, vc, vc, 0, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test40( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test40(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gerpp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test41( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> 
@llvm.ppc.mma.xvf32gerpn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test41(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gerpn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test42( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test42(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gernp(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test43( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test43(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf32gernn(&vq, vc, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test44( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test44(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gerpp(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test45( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test45(unsigned char *vqp, 
unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gerpn(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test46( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test46(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gernp(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test47( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test47(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf32gernn(&vq, vc, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test48( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test48(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gerpp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test49( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// 
+void test49(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gerpn(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test50( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test50(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test51( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test51(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_xvf64gernn(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test52( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2 +// CHECK-NEXT: ret void +// +void test52(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmxvf64gerpp(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test53( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6 
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test53(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvf64gerpn(&vq, vp, vc, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test54(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>*
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test54(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvf64gernp(&vq, vp, vc, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test55(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>*
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32, !tbaa !6
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test55(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test56(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]])
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test56(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_xvbf16ger2(&vq, vc, vc);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test57(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP0]], <512 x i1>* [[TMP1]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test57(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvbf16ger2(&vq, vc, vc, 0, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test58(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test58(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_xvbf16ger2pp(&vq, vc, vc);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test59(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test59(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_xvbf16ger2pn(&vq, vc, vc);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test60(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test60(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_xvbf16ger2np(&vq, vc, vc);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test61(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test61(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_xvbf16ger2nn(&vq, vc, vc);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test62(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test62(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvbf16ger2pp(&vq, vc, vc, 0, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test63(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test63(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvbf16ger2pn(&vq, vc, vc, 0, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test64(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvbf16ger2np(&vq, vc, vc, 0, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test65(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP2]], <512 x i1>* [[TMP3]], align 64, !tbaa !2
+// CHECK-NEXT:    ret void
+//
+void test65(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmxvbf16ger2nn(&vq, vc, vc, 0, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test66(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.mma.xvcvspbf16(<16 x i8> [[VC:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[TMP1]], align 16, !tbaa !8
+// CHECK-NEXT:    ret void
+//
+void test66(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  vector unsigned char res = __builtin_xvcvspbf16(vc);
+  *((vector unsigned char *)resp) = res;
+}
+
+// CHECK-LABEL: @test67(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.mma.xvcvbf16sp(<16 x i8> [[VC:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[TMP1]], align 16, !tbaa !8
+// CHECK-NEXT:    ret void
+//
+void test67(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  vector unsigned char res = __builtin_xvcvbf16sp(vc);
+  *((vector unsigned char *)resp) = res;
+}