diff --git a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h --- a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h @@ -45,6 +45,21 @@ Xor }; +/// Enums used to templatize and share lowering of PowerPC MMA intrinsics. +enum class MMAOp { + AssembleAcc, + AssemblePair, + DisassembleAcc, + DisassemblePair, +}; + +enum class MMAHandlerOp { + NoOp, + SubToFunc, + SubToFuncReverseArgOnLE, + FirstArgIsResult, +}; + // Wrapper struct to encapsulate information for a vector type. Preserves // sign of eleTy if eleTy is signed/unsigned integer. Helps with vector type // conversions. @@ -120,13 +135,16 @@ PPCIntrinsicLibrary() = delete; PPCIntrinsicLibrary(const PPCIntrinsicLibrary &) = delete; + // PPC MMA intrinsic generic handler + template + void genMmaIntr(llvm::ArrayRef); + // PPC intrinsic handlers. template void genMtfsf(llvm::ArrayRef); fir::ExtendedValue genVecAbs(mlir::Type resultType, llvm::ArrayRef args); - template fir::ExtendedValue genVecAddAndMulSubXor(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -5746,6 +5746,9 @@ if (const IntrinsicHandler *handler = findIntrinsicHandler(name)) if (!handler->argLoweringRules.hasDefaultRules()) return &handler->argLoweringRules; + if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) + if (!ppcHandler->argLoweringRules.hasDefaultRules()) + return &ppcHandler->argLoweringRules; return nullptr; } diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp --- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp @@ -14,8 +14,8 @@ 
//===----------------------------------------------------------------------===// #include "flang/Optimizer/Builder/PPCIntrinsicCall.h" +#include "flang/Evaluate/common.h" #include "flang/Optimizer/Builder/FIRBuilder.h" -#include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Optimizer/Builder/MutableBox.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" @@ -25,6 +25,40 @@ // PPC specific intrinsic handlers. static constexpr IntrinsicHandler ppcHandlers[]{ + {"__ppc_mma_assemble_acc", + static_cast( + &PI::genMmaIntr), + {{{"acc", asAddr}, + {"arg1", asValue}, + {"arg2", asValue}, + {"arg3", asValue}, + {"arg4", asValue}}}, + /*isElemental=*/true}, + {"__ppc_mma_assemble_pair", + static_cast( + &PI::genMmaIntr), + {{{"pair", asAddr}, {"arg1", asValue}, {"arg2", asValue}}}, + /*isElemental=*/true}, + {"__ppc_mma_build_acc", + static_cast( + &PI::genMmaIntr), + {{{"acc", asAddr}, + {"arg1", asValue}, + {"arg2", asValue}, + {"arg3", asValue}, + {"arg4", asValue}}}, + /*isElemental=*/true}, + {"__ppc_mma_disassemble_acc", + static_cast( + &PI::genMmaIntr), + {{{"data", asAddr}, {"acc", asValue}}}, + /*isElemental=*/true}, + {"__ppc_mma_disassemble_pair", + static_cast( + &PI::genMmaIntr), + {{{"data", asAddr}, {"pair", asValue}}}, + /*isElemental=*/true}, {"__ppc_mtfsf", static_cast(&PI::genMtfsf), {{{"mask", asValue}, {"r", asValue}}}, @@ -326,6 +360,103 @@ return ppcMathOps.equal_range(name); } +static mlir::FunctionType genMmaVpFuncType(mlir::MLIRContext *context, + int quadCnt, int pairCnt, int vecCnt, + int intCnt = 0, + int vecElemBitSize = 8, + int intBitSize = 32) { + // Constructs a function type with the following signature: + // Result type: __vector_pair + // Arguments: + // quadCnt: number of arguments that has __vector_quad type, followed by + // pairCnt: number of arguments that has __vector_pair type, followed by + // vecCnt: number of arguments that has vector(integer) type, followed by + // intCnt: number of arguments that has integer type + 
// vecElemBitSize: specifies the size of vector elements in bits + // intBitSize: specifies the size of integer arguments in bits + auto vType{mlir::VectorType::get( + 128 / vecElemBitSize, mlir::IntegerType::get(context, vecElemBitSize))}; + auto vpType{fir::VectorType::get(256, mlir::IntegerType::get(context, 1))}; + auto vqType{fir::VectorType::get(512, mlir::IntegerType::get(context, 1))}; + auto iType{mlir::IntegerType::get(context, intBitSize)}; + llvm::SmallVector argTypes; + for (int i = 0; i < quadCnt; ++i) { + argTypes.push_back(vqType); + } + for (int i = 0; i < pairCnt; ++i) { + argTypes.push_back(vpType); + } + for (int i = 0; i < vecCnt; ++i) { + argTypes.push_back(vType); + } + for (int i = 0; i < intCnt; ++i) { + argTypes.push_back(iType); + } + + return mlir::FunctionType::get(context, argTypes, {vpType}); +} + +static mlir::FunctionType genMmaVqFuncType(mlir::MLIRContext *context, + int quadCnt, int pairCnt, int vecCnt, + int intCnt = 0, + int vecElemBitSize = 8, + int intBitSize = 32) { + // Constructs a function type with the following signature: + // Result type: __vector_quad + // Arguments: + // quadCnt: number of arguments that has __vector_quad type, followed by + // pairCnt: number of arguments that has __vector_pair type, followed by + // vecCnt: number of arguments that has vector(integer) type, followed by + // intCnt: number of arguments that has integer type + // vecElemBitSize: specifies the size of vector elements in bits + // intBitSize: specifies the size of integer arguments in bits + auto vType{mlir::VectorType::get( + 128 / vecElemBitSize, mlir::IntegerType::get(context, vecElemBitSize))}; + auto vpType{fir::VectorType::get(256, mlir::IntegerType::get(context, 1))}; + auto vqType{fir::VectorType::get(512, mlir::IntegerType::get(context, 1))}; + auto iType{mlir::IntegerType::get(context, intBitSize)}; + llvm::SmallVector argTypes; + for (int i = 0; i < quadCnt; ++i) { + argTypes.push_back(vqType); + } + for (int i = 0; i < 
pairCnt; ++i) { + argTypes.push_back(vpType); + } + for (int i = 0; i < vecCnt; ++i) { + argTypes.push_back(vType); + } + for (int i = 0; i < intCnt; ++i) { + argTypes.push_back(iType); + } + + return mlir::FunctionType::get(context, argTypes, {vqType}); +} + +mlir::FunctionType genMmaDisassembleFuncType(mlir::MLIRContext *context, + MMAOp mmaOp) { + auto vType{mlir::VectorType::get(16, mlir::IntegerType::get(context, 8))}; + llvm::SmallVector members; + + if (mmaOp == MMAOp::DisassembleAcc) { + auto vqType{fir::VectorType::get(512, mlir::IntegerType::get(context, 1))}; + members.push_back(vType); + members.push_back(vType); + members.push_back(vType); + members.push_back(vType); + auto resType{mlir::LLVM::LLVMStructType::getLiteral(context, members)}; + return mlir::FunctionType::get(context, {vqType}, {resType}); + } else if (mmaOp == MMAOp::DisassemblePair) { + auto vpType{fir::VectorType::get(256, mlir::IntegerType::get(context, 1))}; + members.push_back(vType); + members.push_back(vType); + auto resType{mlir::LLVM::LLVMStructType::getLiteral(context, members)}; + return mlir::FunctionType::get(context, {vpType}, {resType}); + } else { + llvm_unreachable( + "Unsupported intrinsic code for function signature generator"); + } +} + //===----------------------------------------------------------------------===// // PowerPC specific intrinsic handlers. 
//===----------------------------------------------------------------------===// @@ -1130,4 +1261,114 @@ return shftRes; } +const char *getMmaIrIntrName(MMAOp mmaOp) { + switch (mmaOp) { + case MMAOp::AssembleAcc: + return "llvm.ppc.mma.assemble.acc"; + case MMAOp::AssemblePair: + return "llvm.ppc.vsx.assemble.pair"; + case MMAOp::DisassembleAcc: + return "llvm.ppc.mma.disassemble.acc"; + case MMAOp::DisassemblePair: + return "llvm.ppc.vsx.disassemble.pair"; + } +} + +mlir::FunctionType getMmaIrFuncType(mlir::MLIRContext *context, MMAOp mmaOp) { + switch (mmaOp) { + case MMAOp::AssembleAcc: + return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 4); + case MMAOp::AssemblePair: + return genMmaVpFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2); + case MMAOp::DisassembleAcc: + return genMmaDisassembleFuncType(context, mmaOp); + case MMAOp::DisassemblePair: + return genMmaDisassembleFuncType(context, mmaOp); + } +} + +template +void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef args) { + auto context{builder.getContext()}; + mlir::FunctionType intrFuncType{getMmaIrFuncType(context, IntrId)}; + mlir::func::FuncOp funcOp{ + builder.addNamedFunction(loc, getMmaIrIntrName(IntrId), intrFuncType)}; + llvm::SmallVector intrArgs; + + // Depending on SubToFunc, change the subroutine call to a function call. + // First argument represents the result. Rest of the arguments + // are shifted one position to form the actual argument list. + size_t argStart{0}; + size_t argStep{1}; + size_t e{args.size()}; + if (HandlerOp == MMAHandlerOp::SubToFunc) { + // The first argument becomes function result. Start from the second + // argument. + argStart = 1; + } else if (HandlerOp == MMAHandlerOp::SubToFuncReverseArgOnLE) { + // Reverse argument order on little-endian target only. + // The reversal does not depend on the setting of non-native-order option. + if (Fortran::evaluate::isHostLittleEndian) { + // Load the arguments in reverse order. 
+ argStart = args.size() - 1; + // The first argument becomes function result. Stop at the second + // argument. + e = 0; + argStep = -1; + } else { + // Load the arguments in natural order. + // The first argument becomes function result. Start from the second + // argument. + argStart = 1; + } + } + + for (size_t i = argStart, j = 0; i != e; i += argStep, ++j) { + auto v{fir::getBase(args[i])}; + if (i == 0 && HandlerOp == MMAHandlerOp::FirstArgIsResult) { + // First argument is passed in as an address. We need to load + // the content to match the LLVM interface. + v = builder.create(loc, v); + } + auto vType{v.getType()}; + mlir::Type targetType{intrFuncType.getInput(j)}; + if (vType != targetType) { + if (targetType.isa()) { + // Perform vector type conversion for arguments passed by value. + auto eleTy{vType.dyn_cast().getEleTy()}; + auto len{vType.dyn_cast().getLen()}; + mlir::VectorType mlirType = mlir::VectorType::get(len, eleTy); + auto v0{builder.createConvert(loc, mlirType, v)}; + auto v1{builder.create(loc, targetType, v0)}; + intrArgs.push_back(v1); + } else if (targetType.isa() && + vType.isa()) { + auto v0{builder.createConvert(loc, targetType, v)}; + intrArgs.push_back(v0); + } else { + llvm::errs() << "\nUnexpected type conversion requested: " + << " from " << vType << " to " << targetType << "\n"; + llvm_unreachable("Unsupported type conversion for argument to PowerPC " + "MMA intrinsic"); + } + } else { + intrArgs.push_back(v); + } + } + auto callSt{builder.create(loc, funcOp, intrArgs)}; + if (HandlerOp == MMAHandlerOp::SubToFunc || + HandlerOp == MMAHandlerOp::SubToFuncReverseArgOnLE || + HandlerOp == MMAHandlerOp::FirstArgIsResult) { + // Convert pointer type if needed. 
+ mlir::Value callResult{callSt.getResult(0)}; + mlir::Value destPtr{fir::getBase(args[0])}; + mlir::Type callResultPtrType{builder.getRefType(callResult.getType())}; + if (destPtr.getType() != callResultPtrType) { + destPtr = builder.create(loc, callResultPtrType, destPtr); + } + // Copy the result. + builder.create(loc, callResult, destPtr); + } +} + } // namespace fir diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -947,20 +947,21 @@ } static std::optional getVectorElementType(mlir::Type ty) { - if (mlir::isa(ty)) { - auto elemTy = mlir::dyn_cast(ty).getEleTy(); - - // fir.vector<4:ui32> is converted to mlir.vector<4xi32> - if (elemTy.isUnsignedInteger()) { - elemTy = mlir::IntegerType::get( - ty.getContext(), - mlir::dyn_cast(elemTy).getWidth()); - } - return elemTy; - } else if (mlir::isa(ty)) - return mlir::dyn_cast(ty).getElementType(); + mlir::Type elemTy; + if (mlir::isa(ty)) + elemTy = mlir::dyn_cast(ty).getEleTy(); + else if (mlir::isa(ty)) + elemTy = mlir::dyn_cast(ty).getElementType(); + else + return std::nullopt; - return std::nullopt; + // e.g. fir.vector<4:ui32> => mlir.vector<4xi32> + // e.g. 
mlir.vector<4xui32> => mlir.vector<4xi32> + if (elemTy.isUnsignedInteger()) { + elemTy = mlir::IntegerType::get( + ty.getContext(), mlir::dyn_cast(elemTy).getWidth()); + } + return elemTy; } static std::optional getVectorLen(mlir::Type ty) { diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -298,7 +298,8 @@ actualFirstSymbol && actualFirstSymbol->attrs().test(Attr::ASYNCHRONOUS)}; bool actualIsVolatile{ actualFirstSymbol && actualFirstSymbol->attrs().test(Attr::VOLATILE)}; - if (const auto *derived{evaluate::GetDerivedTypeSpec(actualType.type())}) { + const auto *derived{evaluate::GetDerivedTypeSpec(actualType.type())}; + if (derived && !derived->IsVectorType()) { if (dummy.type.type().IsAssumedType()) { if (!derived->parameters().empty()) { // 15.5.2.4(2) messages.Say( diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -518,8 +518,11 @@ .statement.v.source == "__ppc_types")) { // Don't try to read the builtins module when we're actually building it. } else if (frontModule && - std::get>(frontModule->value().t) - .statement.v.source == "__ppc_intrinsics") { + (std::get>(frontModule->value().t) + .statement.v.source == "__ppc_intrinsics" || + std::get>( + frontModule->value().t) + .statement.v.source == "mma")) { // The derived type definition for the vectors is needed. context_.UsePPCBuiltinTypesModule(); } else { diff --git a/flang/module/mma.f90 b/flang/module/mma.f90 new file mode 100644 --- /dev/null +++ b/flang/module/mma.f90 @@ -0,0 +1,220 @@ +!===-- module/mma.f90 ------------------------------------------------------===! +! +! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +! See https://llvm.org/LICENSE.txt for license information. +! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +! 
+!===------------------------------------------------------------------------===! + +module mma + implicit none + private + + abstract interface + +!! ========== 3 arguments subroutine interface ===============================!! +!! subroutine s(__vector_pair, vector(i), vector(i)) +#define ELEM_SUB_VPVIVI(VKIND) \ + elemental subroutine sub_vpvi##VKIND##vi##VKIND(pair, arg1, arg2); \ + __vector_pair, intent(out) :: pair ; \ + vector(integer(VKIND)), intent(in) :: arg1, arg2; \ + end subroutine ; + +!! subroutine s(__vector_pair, vector(u), vector(u)) +#define ELEM_SUB_VPVUVU(VKIND) \ + elemental subroutine sub_vpvu##VKIND##vu##VKIND(pair, arg1, arg2); \ + __vector_pair, intent(out) :: pair ; \ + vector(unsigned(VKIND)), intent(in) :: arg1, arg2; \ + end subroutine ; + +!! subroutine s(__vector_pair, vector(r), vector(r)) +#define ELEM_SUB_VPVRVR(VKIND) \ + elemental subroutine sub_vpvr##VKIND##vr##VKIND(pair, arg1, arg2); \ + __vector_pair, intent(out) :: pair ; \ + vector(real(VKIND)), intent(in) :: arg1, arg2; \ + end subroutine ; + + ELEM_SUB_VPVIVI(1) ELEM_SUB_VPVIVI(2) + ELEM_SUB_VPVIVI(4) ELEM_SUB_VPVIVI(8) + ELEM_SUB_VPVUVU(1) ELEM_SUB_VPVUVU(2) + ELEM_SUB_VPVUVU(4) ELEM_SUB_VPVUVU(8) + ELEM_SUB_VPVRVR(4) ELEM_SUB_VPVRVR(8) + +#undef ELEM_SUB_VPVIVI +#undef ELEM_SUB_VPVUVU +#undef ELEM_SUB_VPVRVR + +!! ========== 5 arguments subroutine interface ===============================!! +!! subroutine s(__vector_quad, vector(i), vector(i), vector(i), vector(i)) +#define ELEM_SUB_VQVIVIVIVI(VKIND) \ + elemental subroutine sub_vqvi##VKIND##vi##VKIND##vi##VKIND##vi##VKIND(acc, arg1, arg2, arg3, arg4); \ + __vector_quad, intent(out) :: acc; \ + vector(integer(VKIND)), intent(in) :: arg1, arg2, arg3, arg4; \ + end subroutine ; + +!! 
subroutine s(__vector_quad, vector(u), vector(u), vector(u), vector(u)) +#define ELEM_SUB_VQVUVUVUVU(VKIND) \ + elemental subroutine sub_vqvu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND(acc, arg1, arg2, arg3, arg4); \ + __vector_quad, intent(out) :: acc; \ + vector(unsigned(VKIND)), intent(in) :: arg1, arg2, arg3, arg4; \ + end subroutine ; + +!! subroutine s(__vector_quad, vector(r), vector(r), vector(r), vector(r)) +#define ELEM_SUB_VQVRVRVRVR(VKIND) \ + elemental subroutine sub_vqvr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND(acc, arg1, arg2, arg3, arg4); \ + __vector_quad, intent(out) :: acc; \ + vector(real(VKIND)), intent(in) :: arg1, arg2, arg3, arg4; \ + end subroutine ; + + ELEM_SUB_VQVIVIVIVI(1) ELEM_SUB_VQVIVIVIVI(2) + ELEM_SUB_VQVIVIVIVI(4) ELEM_SUB_VQVIVIVIVI(8) + ELEM_SUB_VQVUVUVUVU(1) ELEM_SUB_VQVUVUVUVU(2) + ELEM_SUB_VQVUVUVUVU(4) ELEM_SUB_VQVUVUVUVU(8) + ELEM_SUB_VQVRVRVRVR(4) ELEM_SUB_VQVRVRVRVR(8) + +#undef ELEM_SUB_VQVRVRVRVR +#undef ELEM_SUB_VQVUVUVUVU +#undef ELEM_SUB_VQVIVIVIVI + +!! ========== non-macro interface =============================================!! + elemental subroutine sub_atvp(data, pair) + ! Dummy arg 'data' is supposed to be intent(out) of any type, + ! but according to Fortran 2018: C709: Type(*) arguments can not have + ! intent(out) attribute. Use intent(inout) instead. + type(*), intent(inout) :: data + __vector_pair, intent(inout) :: pair + end subroutine + + elemental subroutine sub_atvq(data, acc) + ! Dummy arg 'data' is supposed to be intent(out) of any type, + ! but according to Fortran 2018: C709: Type(*) arguments can not have + ! intent(out) attribute. Use intent(inout) instead. 
+ type(*), intent(inout) :: data + __vector_quad, intent(inout) :: acc + end subroutine + + end interface + + +#define SUB_VQ_VI_VI_VI_VI(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND##vi##VKIND##vi##VKIND +#define SUB_VQ_VU_VU_VU_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND +#define SUB_VQ_VR_VR_VR_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND + +#define VEC_SUB_VQ_VI_VI_VI_VI(NAME, VKIND) \ + procedure(sub_vqvi##VKIND##vi##VKIND##vi##VKIND##vi##VKIND) :: SUB_VQ_VI_VI_VI_VI(NAME, VKIND); +#define VEC_SUB_VQ_VU_VU_VU_VU(NAME, VKIND) \ + procedure(sub_vqvu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND) :: SUB_VQ_VU_VU_VU_VU(NAME, VKIND); +#define VEC_SUB_VQ_VR_VR_VR_VR(NAME, VKIND) \ + procedure(sub_vqvr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND) :: SUB_VQ_VR_VR_VR_VR(NAME, VKIND); + +! mma_assemble_acc + VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,1) + VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,2) + VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,4) + VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,8) + VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,1) + VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,2) + VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,4) + VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,8) + VEC_SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,4) + VEC_SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,8) + interface mma_assemble_acc + procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,1) + procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,2) + procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,4) + procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,8) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,1) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,2) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,4) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,8) + procedure :: SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,4) + procedure :: SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,8) + end interface + public mma_assemble_acc + +! 
mma_build_acc + VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,1) + VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,2) + VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,4) + VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,8) + VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,1) + VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,2) + VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,4) + VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,8) + VEC_SUB_VQ_VR_VR_VR_VR(mma_build_acc,4) + VEC_SUB_VQ_VR_VR_VR_VR(mma_build_acc,8) + interface mma_build_acc + procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,1) + procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,2) + procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,4) + procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,8) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,1) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,2) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,4) + procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,8) + procedure :: SUB_VQ_VR_VR_VR_VR(mma_build_acc,4) + procedure :: SUB_VQ_VR_VR_VR_VR(mma_build_acc,8) + end interface + public mma_build_acc + +#undef VEC_SUB_VQ_VR_VR_VR_VR +#undef VEC_SUB_VQ_VU_VU_VU_VU +#undef VEC_SUB_VQ_VI_VI_VI_VI +#undef SUB_VQ_VR_VR_VR_VR +#undef SUB_VQ_VU_VU_VU_VU +#undef SUB_VQ_VI_VI_VI_VI + +#define SUB_VP_VI_VI(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND +#define SUB_VP_VU_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND +#define SUB_VP_VR_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND + +#define VEC_SUB_VP_VI_VI(NAME, VKIND) \ + procedure(sub_vpvi##VKIND##vi##VKIND) :: SUB_VP_VI_VI(NAME, VKIND); +#define VEC_SUB_VP_VU_VU(NAME, VKIND) \ + procedure(sub_vpvu##VKIND##vu##VKIND) :: SUB_VP_VU_VU(NAME, VKIND); +#define VEC_SUB_VP_VR_VR(NAME, VKIND) \ + procedure(sub_vpvr##VKIND##vr##VKIND) :: SUB_VP_VR_VR(NAME, VKIND); + +! 
mma_assemble_pair + VEC_SUB_VP_VI_VI(mma_assemble_pair,1) VEC_SUB_VP_VI_VI(mma_assemble_pair,2) + VEC_SUB_VP_VI_VI(mma_assemble_pair,4) VEC_SUB_VP_VI_VI(mma_assemble_pair,8) + VEC_SUB_VP_VU_VU(mma_assemble_pair,1) VEC_SUB_VP_VU_VU(mma_assemble_pair,2) + VEC_SUB_VP_VU_VU(mma_assemble_pair,4) VEC_SUB_VP_VU_VU(mma_assemble_pair,8) + VEC_SUB_VP_VR_VR(mma_assemble_pair,4) VEC_SUB_VP_VR_VR(mma_assemble_pair,8) + interface mma_assemble_pair + procedure :: SUB_VP_VI_VI(mma_assemble_pair,1) + procedure :: SUB_VP_VI_VI(mma_assemble_pair,2) + procedure :: SUB_VP_VI_VI(mma_assemble_pair,4) + procedure :: SUB_VP_VI_VI(mma_assemble_pair,8) + procedure :: SUB_VP_VU_VU(mma_assemble_pair,1) + procedure :: SUB_VP_VU_VU(mma_assemble_pair,2) + procedure :: SUB_VP_VU_VU(mma_assemble_pair,4) + procedure :: SUB_VP_VU_VU(mma_assemble_pair,8) + procedure :: SUB_VP_VR_VR(mma_assemble_pair,4) + procedure :: SUB_VP_VR_VR(mma_assemble_pair,8) + end interface + public mma_assemble_pair + +#undef VEC_SUB_VP_VR_VR +#undef VEC_SUB_VP_VU_VU +#undef VEC_SUB_VP_VI_VI +#undef SUB_VP_VR_VR +#undef SUB_VP_VU_VU +#undef SUB_VP_VI_VI + +! mma_disassemble_acc + procedure(sub_atvq) :: __ppc_mma_disassemble_acc + interface mma_disassemble_acc + procedure :: __ppc_mma_disassemble_acc + end interface + public mma_disassemble_acc + +! mma_disassemble_pair + procedure(sub_atvp) :: __ppc_mma_disassemble_pair + interface mma_disassemble_pair + procedure :: __ppc_mma_disassemble_pair + end interface + public mma_disassemble_pair + +end module + diff --git a/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90 b/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90 new file mode 100644 --- /dev/null +++ b/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90 @@ -0,0 +1,716 @@ +! RUN: %flang --target=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -emit-llvm -S %s -o - | FileCheck --check-prefixes="CHECK" %s +! REQUIRES: target=powerpc{{.*}} + +! 
mma_assemble_acc + + subroutine test_assemble_acc_i1() + use, intrinsic :: mma + implicit none + vector(integer(1)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_i1 + +! CHECK-LABEL: @test_assemble_acc_i1 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %3 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %4 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %5 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %6 = load <16 x i8>, ptr %2, align 16 +! CHECK: %7 = load <16 x i8>, ptr %3, align 16 +! CHECK: %8 = load <16 x i8>, ptr %4, align 16 +! CHECK: %9 = load <16 x i8>, ptr %5, align 16 +! CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %6, <16 x i8> %7, <16 x i8> %8, <16 x i8> %9) +! CHECK: store <512 x i1> %10, ptr %1, align 64 + + subroutine test_assemble_acc_i2() + use, intrinsic :: mma + implicit none + vector(integer(2)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_i2 + +! CHECK-LABEL: @test_assemble_acc_i2 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %3 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %4 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %5 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %6 = load <8 x i16>, ptr %2, align 16 +! CHECK: %7 = load <8 x i16>, ptr %3, align 16 +! CHECK: %8 = load <8 x i16>, ptr %4, align 16 +! CHECK: %9 = load <8 x i16>, ptr %5, align 16 +! CHECK: %10 = bitcast <8 x i16> %6 to <16 x i8> +! CHECK: %11 = bitcast <8 x i16> %7 to <16 x i8> +! CHECK: %12 = bitcast <8 x i16> %8 to <16 x i8> +! CHECK: %13 = bitcast <8 x i16> %9 to <16 x i8> +! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +! 
CHECK: store <512 x i1> %14, ptr %1, align 64 + + + subroutine test_assemble_acc_i4() + use, intrinsic :: mma + implicit none + vector(integer(4)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_i4 + +! CHECK-LABEL: @test_assemble_acc_i4 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %3 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %4 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %5 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %6 = load <4 x i32>, ptr %2, align 16 +! CHECK: %7 = load <4 x i32>, ptr %3, align 16 +! CHECK: %8 = load <4 x i32>, ptr %4, align 16 +! CHECK: %9 = load <4 x i32>, ptr %5, align 16 +! CHECK: %10 = bitcast <4 x i32> %6 to <16 x i8> +! CHECK: %11 = bitcast <4 x i32> %7 to <16 x i8> +! CHECK: %12 = bitcast <4 x i32> %8 to <16 x i8> +! CHECK: %13 = bitcast <4 x i32> %9 to <16 x i8> +! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +! CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_assemble_acc_i8() + use, intrinsic :: mma + implicit none + vector(integer(8)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_i8 + +! CHECK-LABEL: @test_assemble_acc_i8 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %3 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %4 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %5 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %6 = load <2 x i64>, ptr %2, align 16 +! CHECK: %7 = load <2 x i64>, ptr %3, align 16 +! CHECK: %8 = load <2 x i64>, ptr %4, align 16 +! CHECK: %9 = load <2 x i64>, ptr %5, align 16 +! CHECK: %10 = bitcast <2 x i64> %6 to <16 x i8> +! CHECK: %11 = bitcast <2 x i64> %7 to <16 x i8> +! CHECK: %12 = bitcast <2 x i64> %8 to <16 x i8> +! 
CHECK: %13 = bitcast <2 x i64> %9 to <16 x i8> +! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +! CHECK: store <512 x i1> %14, ptr %1, align 64 + + + subroutine test_assemble_acc_u1() + use, intrinsic :: mma + implicit none + vector(unsigned(1)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_u1 + +! CHECK-LABEL: @test_assemble_acc_u1 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %3 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %4 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %5 = alloca <16 x i8>, i64 1, align 16 +! CHECK: %6 = load <16 x i8>, ptr %2, align 16 +! CHECK: %7 = load <16 x i8>, ptr %3, align 16 +! CHECK: %8 = load <16 x i8>, ptr %4, align 16 +! CHECK: %9 = load <16 x i8>, ptr %5, align 16 +! CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %6, <16 x i8> %7, <16 x i8> %8, <16 x i8> %9) +! CHECK: store <512 x i1> %10, ptr %1, align 64 + + subroutine test_assemble_acc_u2() + use, intrinsic :: mma + implicit none + vector(unsigned(2)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_u2 + +! CHECK-LABEL: @test_assemble_acc_u2 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %3 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %4 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %5 = alloca <8 x i16>, i64 1, align 16 +! CHECK: %6 = load <8 x i16>, ptr %2, align 16 +! CHECK: %7 = load <8 x i16>, ptr %3, align 16 +! CHECK: %8 = load <8 x i16>, ptr %4, align 16 +! CHECK: %9 = load <8 x i16>, ptr %5, align 16 +! CHECK: %10 = bitcast <8 x i16> %6 to <16 x i8> +! CHECK: %11 = bitcast <8 x i16> %7 to <16 x i8> +! CHECK: %12 = bitcast <8 x i16> %8 to <16 x i8> +! 
CHECK: %13 = bitcast <8 x i16> %9 to <16 x i8> +! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +! CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_assemble_acc_u4() + use, intrinsic :: mma + implicit none + vector(unsigned(4)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_u4 + +! CHECK-LABEL: @test_assemble_acc_u4 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %3 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %4 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %5 = alloca <4 x i32>, i64 1, align 16 +! CHECK: %6 = load <4 x i32>, ptr %2, align 16 +! CHECK: %7 = load <4 x i32>, ptr %3, align 16 +! CHECK: %8 = load <4 x i32>, ptr %4, align 16 +! CHECK: %9 = load <4 x i32>, ptr %5, align 16 +! CHECK: %10 = bitcast <4 x i32> %6 to <16 x i8> +! CHECK: %11 = bitcast <4 x i32> %7 to <16 x i8> +! CHECK: %12 = bitcast <4 x i32> %8 to <16 x i8> +! CHECK: %13 = bitcast <4 x i32> %9 to <16 x i8> +! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +! CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_assemble_acc_u8() + use, intrinsic :: mma + implicit none + vector(unsigned(8)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_u8 + +! CHECK-LABEL: @test_assemble_acc_u8 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %3 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %4 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %5 = alloca <2 x i64>, i64 1, align 16 +! CHECK: %6 = load <2 x i64>, ptr %2, align 16 +! CHECK: %7 = load <2 x i64>, ptr %3, align 16 +! CHECK: %8 = load <2 x i64>, ptr %4, align 16 +! 
CHECK: %9 = load <2 x i64>, ptr %5, align 16 +! CHECK: %10 = bitcast <2 x i64> %6 to <16 x i8> +! CHECK: %11 = bitcast <2 x i64> %7 to <16 x i8> +! CHECK: %12 = bitcast <2 x i64> %8 to <16 x i8> +! CHECK: %13 = bitcast <2 x i64> %9 to <16 x i8> +! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +! CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_assemble_acc_r4() + use, intrinsic :: mma + implicit none + vector(real(4)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_r4 + +! CHECK-LABEL: @test_assemble_acc_r4 +! CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +! CHECK: %2 = alloca <4 x float>, i64 1, align 16 +! CHECK: %3 = alloca <4 x float>, i64 1, align 16 +! CHECK: %4 = alloca <4 x float>, i64 1, align 16 +! CHECK: %5 = alloca <4 x float>, i64 1, align 16 +! CHECK: %6 = load <4 x float>, ptr %2, align 16 +! CHECK: %7 = load <4 x float>, ptr %3, align 16 +! CHECK: %8 = load <4 x float>, ptr %4, align 16 +! CHECK: %9 = load <4 x float>, ptr %5, align 16 +! CHECK: %10 = bitcast <4 x float> %6 to <16 x i8> +! CHECK: %11 = bitcast <4 x float> %7 to <16 x i8> +! CHECK: %12 = bitcast <4 x float> %8 to <16 x i8> +! CHECK: %13 = bitcast <4 x float> %9 to <16 x i8> +! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +! 
CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_assemble_acc_r8() + use, intrinsic :: mma + implicit none + vector(real(8)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_assemble_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_assemble_acc_r8 + +!CHECK-LABEL: @test_assemble_acc_r8 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <2 x double>, i64 1, align 16 +!CHECK: %3 = alloca <2 x double>, i64 1, align 16 +!CHECK: %4 = alloca <2 x double>, i64 1, align 16 +!CHECK: %5 = alloca <2 x double>, i64 1, align 16 +!CHECK: %6 = load <2 x double>, ptr %2, align 16 +!CHECK: %7 = load <2 x double>, ptr %3, align 16 +!CHECK: %8 = load <2 x double>, ptr %4, align 16 +!CHECK: %9 = load <2 x double>, ptr %5, align 16 +!CHECK: %10 = bitcast <2 x double> %6 to <16 x i8> +!CHECK: %11 = bitcast <2 x double> %7 to <16 x i8> +!CHECK: %12 = bitcast <2 x double> %8 to <16 x i8> +!CHECK: %13 = bitcast <2 x double> %9 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + +! 
mma_assemble_pair + + subroutine test_mma_assemble_pair_i1() + use, intrinsic :: mma + implicit none + vector(integer(1)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_i1 + +!CHECK: @test_mma_assemble_pair_i1_ +!CHECK: %1 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %2 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <16 x i8>, ptr %1, align 16 +!CHECK: %5 = load <16 x i8>, ptr %2, align 16 +!CHECK: %6 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %4, <16 x i8> %5) +!CHECK: store <256 x i1> %6, ptr %3, align 32 + + subroutine test_mma_assemble_pair_i2() + use, intrinsic :: mma + implicit none + vector(integer(2)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_i2 + +!CHECK: @test_mma_assemble_pair_i2_ +!CHECK: %1 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %2 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <8 x i16>, ptr %1, align 16 +!CHECK: %5 = load <8 x i16>, ptr %2, align 16 +!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8> +!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8> +!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + + subroutine test_mma_assemble_pair_i4() + use, intrinsic :: mma + implicit none + vector(integer(4)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_i4 + +!CHECK: @test_mma_assemble_pair_i4_ +!CHECK: %1 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %2 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <4 x i32>, ptr %1, align 16 +!CHECK: %5 = load <4 x i32>, ptr %2, align 16 +!CHECK: %6 = bitcast <4 x i32> %4 to <16 x i8> +!CHECK: %7 = bitcast <4 x i32> %5 to <16 x i8> +!CHECK: %8 = call <256 x 
i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + + subroutine test_mma_assemble_pair_i8() + use, intrinsic :: mma + implicit none + vector(integer(8)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_i8 + +!CHECK: @test_mma_assemble_pair_i8_ +!CHECK: %1 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %2 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <2 x i64>, ptr %1, align 16 +!CHECK: %5 = load <2 x i64>, ptr %2, align 16 +!CHECK: %6 = bitcast <2 x i64> %4 to <16 x i8> +!CHECK: %7 = bitcast <2 x i64> %5 to <16 x i8> +!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + + subroutine test_mma_assemble_pair_u1() + use, intrinsic :: mma + implicit none + vector(unsigned(1)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_u1 + +!CHECK: @test_mma_assemble_pair_u1_ +!CHECK: %1 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %2 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <16 x i8>, ptr %1, align 16 +!CHECK: %5 = load <16 x i8>, ptr %2, align 16 +!CHECK: %6 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %4, <16 x i8> %5) +!CHECK: store <256 x i1> %6, ptr %3, align 32 + + subroutine test_mma_assemble_pair_u2() + use, intrinsic :: mma + implicit none + vector(unsigned(2)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_u2 + +!CHECK: @test_mma_assemble_pair_u2_ +!CHECK: %1 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %2 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <8 x i16>, ptr %1, align 16 +!CHECK: %5 = load <8 x i16>, ptr %2, align 16 +!CHECK: %6 = bitcast <8 x i16> 
%4 to <16 x i8> +!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8> +!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + + subroutine test_mma_assemble_pair_u4() + use, intrinsic :: mma + implicit none + vector(unsigned(4)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_u4 + +!CHECK: @test_mma_assemble_pair_u4_ +!CHECK: %1 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %2 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <4 x i32>, ptr %1, align 16 +!CHECK: %5 = load <4 x i32>, ptr %2, align 16 +!CHECK: %6 = bitcast <4 x i32> %4 to <16 x i8> +!CHECK: %7 = bitcast <4 x i32> %5 to <16 x i8> +!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + + subroutine test_mma_assemble_pair_u8() + use, intrinsic :: mma + implicit none + vector(unsigned(8)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_u8 + +!CHECK: @test_mma_assemble_pair_u8_ +!CHECK: %1 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %2 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <2 x i64>, ptr %1, align 16 +!CHECK: %5 = load <2 x i64>, ptr %2, align 16 +!CHECK: %6 = bitcast <2 x i64> %4 to <16 x i8> +!CHECK: %7 = bitcast <2 x i64> %5 to <16 x i8> +!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + + subroutine test_mma_assemble_pair_r4() + use, intrinsic :: mma + implicit none + vector(real(4)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_r4 + +!CHECK: @test_mma_assemble_pair_r4_ +!CHECK: %1 = alloca <4 x float>, i64 1, align 16 +!CHECK: %2 = alloca <4 x float>, i64 
1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <4 x float>, ptr %1, align 16 +!CHECK: %5 = load <4 x float>, ptr %2, align 16 +!CHECK: %6 = bitcast <4 x float> %4 to <16 x i8> +!CHECK: %7 = bitcast <4 x float> %5 to <16 x i8> +!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + + subroutine test_mma_assemble_pair_r8() + use, intrinsic :: mma + implicit none + vector(real(8)) vi10, vi11 + __vector_pair :: vp + call mma_assemble_pair(vp, vi10, vi11) + end subroutine test_mma_assemble_pair_r8 + +!CHECK: @test_mma_assemble_pair_r8_ +!CHECK: %1 = alloca <2 x double>, i64 1, align 16 +!CHECK: %2 = alloca <2 x double>, i64 1, align 16 +!CHECK: %3 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %4 = load <2 x double>, ptr %1, align 16 +!CHECK: %5 = load <2 x double>, ptr %2, align 16 +!CHECK: %6 = bitcast <2 x double> %4 to <16 x i8> +!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8> +!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7) +!CHECK: store <256 x i1> %8, ptr %3, align 32 + +! 
mma_build_acc + + subroutine test_mma_build_acc_i1() + use, intrinsic :: mma + implicit none + vector(integer(1)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_i1 + +!CHECK-LABEL: @test_mma_build_acc_i1 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %3 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %4 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %5 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %6 = load <16 x i8>, ptr %2, align 16 +!CHECK: %7 = load <16 x i8>, ptr %3, align 16 +!CHECK: %8 = load <16 x i8>, ptr %4, align 16 +!CHECK: %9 = load <16 x i8>, ptr %5, align 16 +!CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %9, <16 x i8> %8, <16 x i8> %7, <16 x i8> %6) +!CHECK: store <512 x i1> %10, ptr %1, align 64 + + subroutine test_mma_build_acc_i2() + use, intrinsic :: mma + implicit none + vector(integer(2)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_i2 + +!CHECK-LABEL: @test_mma_build_acc_i2 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %3 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %4 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %5 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %6 = load <8 x i16>, ptr %2, align 16 +!CHECK: %7 = load <8 x i16>, ptr %3, align 16 +!CHECK: %8 = load <8 x i16>, ptr %4, align 16 +!CHECK: %9 = load <8 x i16>, ptr %5, align 16 +!CHECK: %10 = bitcast <8 x i16> %9 to <16 x i8> +!CHECK: %11 = bitcast <8 x i16> %8 to <16 x i8> +!CHECK: %12 = bitcast <8 x i16> %7 to <16 x i8> +!CHECK: %13 = bitcast <8 x i16> %6 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_mma_build_acc_i4() + use,
intrinsic :: mma + implicit none + vector(integer(4)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_i4 + +!CHECK-LABEL: @test_mma_build_acc_i4 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %3 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %4 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %5 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %6 = load <4 x i32>, ptr %2, align 16 +!CHECK: %7 = load <4 x i32>, ptr %3, align 16 +!CHECK: %8 = load <4 x i32>, ptr %4, align 16 +!CHECK: %9 = load <4 x i32>, ptr %5, align 16 +!CHECK: %10 = bitcast <4 x i32> %9 to <16 x i8> +!CHECK: %11 = bitcast <4 x i32> %8 to <16 x i8> +!CHECK: %12 = bitcast <4 x i32> %7 to <16 x i8> +!CHECK: %13 = bitcast <4 x i32> %6 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_mma_build_acc_i8() + use, intrinsic :: mma + implicit none + vector(integer(8)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_i8 + +!CHECK-LABEL: @test_mma_build_acc_i8 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %3 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %4 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %5 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %6 = load <2 x i64>, ptr %2, align 16 +!CHECK: %7 = load <2 x i64>, ptr %3, align 16 +!CHECK: %8 = load <2 x i64>, ptr %4, align 16 +!CHECK: %9 = load <2 x i64>, ptr %5, align 16 +!CHECK: %10 = bitcast <2 x i64> %9 to <16 x i8> +!CHECK: %11 = bitcast <2 x i64> %8 to <16 x i8> +!CHECK: %12 = bitcast <2 x i64> %7 to <16 x i8> +!CHECK: %13 = bitcast <2 x i64> %6 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> 
%11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_mma_build_acc_u1() + use, intrinsic :: mma + implicit none + vector(unsigned(1)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_u1 + +!CHECK-LABEL: @test_mma_build_acc_u1 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %3 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %4 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %5 = alloca <16 x i8>, i64 1, align 16 +!CHECK: %6 = load <16 x i8>, ptr %2, align 16 +!CHECK: %7 = load <16 x i8>, ptr %3, align 16 +!CHECK: %8 = load <16 x i8>, ptr %4, align 16 +!CHECK: %9 = load <16 x i8>, ptr %5, align 16 +!CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %9, <16 x i8> %8, <16 x i8> %7, <16 x i8> %6) +!CHECK: store <512 x i1> %10, ptr %1, align 64 + + subroutine test_mma_build_acc_u2() + use, intrinsic :: mma + implicit none + vector(unsigned(2)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_u2 + +!CHECK-LABEL: @test_mma_build_acc_u2 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %3 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %4 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %5 = alloca <8 x i16>, i64 1, align 16 +!CHECK: %6 = load <8 x i16>, ptr %2, align 16 +!CHECK: %7 = load <8 x i16>, ptr %3, align 16 +!CHECK: %8 = load <8 x i16>, ptr %4, align 16 +!CHECK: %9 = load <8 x i16>, ptr %5, align 16 +!CHECK: %10 = bitcast <8 x i16> %9 to <16 x i8> +!CHECK: %11 = bitcast <8 x i16> %8 to <16 x i8> +!CHECK: %12 = bitcast <8 x i16> %7 to <16 x i8> +!CHECK: %13 = bitcast <8 x i16> %6 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr 
%1, align 64 + + subroutine test_mma_build_acc_u4() + use, intrinsic :: mma + implicit none + vector(unsigned(4)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_u4 + +!CHECK-LABEL: @test_mma_build_acc_u4 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %3 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %4 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %5 = alloca <4 x i32>, i64 1, align 16 +!CHECK: %6 = load <4 x i32>, ptr %2, align 16 +!CHECK: %7 = load <4 x i32>, ptr %3, align 16 +!CHECK: %8 = load <4 x i32>, ptr %4, align 16 +!CHECK: %9 = load <4 x i32>, ptr %5, align 16 +!CHECK: %10 = bitcast <4 x i32> %9 to <16 x i8> +!CHECK: %11 = bitcast <4 x i32> %8 to <16 x i8> +!CHECK: %12 = bitcast <4 x i32> %7 to <16 x i8> +!CHECK: %13 = bitcast <4 x i32> %6 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + + subroutine test_mma_build_acc_u8() + use, intrinsic :: mma + implicit none + vector(unsigned(8)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_u8 + +!CHECK-LABEL: @test_mma_build_acc_u8 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %3 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %4 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %5 = alloca <2 x i64>, i64 1, align 16 +!CHECK: %6 = load <2 x i64>, ptr %2, align 16 +!CHECK: %7 = load <2 x i64>, ptr %3, align 16 +!CHECK: %8 = load <2 x i64>, ptr %4, align 16 +!CHECK: %9 = load <2 x i64>, ptr %5, align 16 +!CHECK: %10 = bitcast <2 x i64> %9 to <16 x i8> +!CHECK: %11 = bitcast <2 x i64> %8 to <16 x i8> +!CHECK: %12 = bitcast <2 x i64> %7 to <16 x i8> +!CHECK: %13 = bitcast <2 x i64> %6 to <16 x i8> +!CHECK: %14 = call 
<512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + + + subroutine test_mma_build_acc_r4() + use, intrinsic :: mma + implicit none + vector(real(4)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_r4 + +!CHECK-LABEL: @test_mma_build_acc_r4 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <4 x float>, i64 1, align 16 +!CHECK: %3 = alloca <4 x float>, i64 1, align 16 +!CHECK: %4 = alloca <4 x float>, i64 1, align 16 +!CHECK: %5 = alloca <4 x float>, i64 1, align 16 +!CHECK: %6 = load <4 x float>, ptr %2, align 16 +!CHECK: %7 = load <4 x float>, ptr %3, align 16 +!CHECK: %8 = load <4 x float>, ptr %4, align 16 +!CHECK: %9 = load <4 x float>, ptr %5, align 16 +!CHECK: %10 = bitcast <4 x float> %9 to <16 x i8> +!CHECK: %11 = bitcast <4 x float> %8 to <16 x i8> +!CHECK: %12 = bitcast <4 x float> %7 to <16 x i8> +!CHECK: %13 = bitcast <4 x float> %6 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + + + subroutine test_mma_build_acc_r8() + use, intrinsic :: mma + implicit none + vector(real(8)) vi10, vi11, vi12, vi13 + __vector_quad :: cq + call mma_build_acc(cq, vi10, vi11, vi12, vi13) + end subroutine test_mma_build_acc_r8 + +!CHECK-LABEL: @test_mma_build_acc_r8 +!CHECK: %1 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %2 = alloca <2 x double>, i64 1, align 16 +!CHECK: %3 = alloca <2 x double>, i64 1, align 16 +!CHECK: %4 = alloca <2 x double>, i64 1, align 16 +!CHECK: %5 = alloca <2 x double>, i64 1, align 16 +!CHECK: %6 = load <2 x double>, ptr %2, align 16 +!CHECK: %7 = load <2 x double>, ptr %3, align 16 +!CHECK: %8 = load <2 x double>, ptr %4, align 16 +!CHECK: %9 = load <2 x double>, ptr %5, align 16 +!CHECK: %10 = bitcast <2 x double> %9 to 
<16 x i8> +!CHECK: %11 = bitcast <2 x double> %8 to <16 x i8> +!CHECK: %12 = bitcast <2 x double> %7 to <16 x i8> +!CHECK: %13 = bitcast <2 x double> %6 to <16 x i8> +!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) +!CHECK: store <512 x i1> %14, ptr %1, align 64 + +! mma_disassemble_acc + + subroutine test_disassemble_acc() + use, intrinsic :: mma + implicit none + __vector_quad :: vq + real :: data + call mma_disassemble_acc(data, vq) + end subroutine + +!CHECK-LABEL: @test_disassemble_acc_ +!CHECK: %1 = alloca float, i64 1, align 4 +!CHECK: %2 = alloca <512 x i1>, i64 1, align 64 +!CHECK: %3 = load <512 x i1>, ptr %2, align 64 +!CHECK: %4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %3) +!CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, ptr %1, align 16 + +! mma_disassemble_pair + + subroutine test_disassemble_pair() + use, intrinsic :: mma + implicit none + __vector_pair :: vp + real :: data + call mma_disassemble_pair(data, vp) + end subroutine + +!CHECK-LABEL: @test_disassemble_pair_ +!CHECK: %1 = alloca float, i64 1, align 4 +!CHECK: %2 = alloca <256 x i1>, i64 1, align 32 +!CHECK: %3 = load <256 x i1>, ptr %2, align 32 +!CHECK: %4 = call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %3) +!CHECK: store { <16 x i8>, <16 x i8> } %4, ptr %1, align 16 diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -10,6 +10,7 @@ "__fortran_type_info" "__ppc_types" "__ppc_intrinsics" + "mma" "__cuda_builtins" "ieee_arithmetic" "ieee_exceptions" @@ -32,7 +33,8 @@ set(depends "") elseif(${filename} STREQUAL "__ppc_types") set(depends "") - elseif(${filename} STREQUAL "__ppc_intrinsics") + elseif(${filename} STREQUAL "__ppc_intrinsics" OR + ${filename} STREQUAL "mma") set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod)
else() + set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod) @@ -47,7 +49,8 @@ # The module contains PPC vector types that needs the PPC target. set(opts "") - if(${filename} STREQUAL "__ppc_intrinsics") + if(${filename} STREQUAL "__ppc_intrinsics" OR + ${filename} STREQUAL "mma") set(opts "--target=ppc64le") endif()