Changeset View
Changeset View
Standalone View
Standalone View
polly/lib/Transform/MatmulOptimizer.cpp
Show First 20 Lines • Show All 790 Lines • ▼ Show 20 Lines | |||||
static isl::schedule_node createExtensionNode(isl::schedule_node Node,
                                              isl::map ExtensionMap) {
  // Wrap the single map into a union map and graft a fresh extension node
  // in front of @p Node.
  isl::union_map ExtensionUMap = isl::union_map(ExtensionMap);
  isl::schedule_node ExtensionNode =
      isl::schedule_node::from_extension(ExtensionUMap);
  return Node.graft_before(ExtensionNode);
}
/// Create the copy statement and redirect access
///
/// Create a new statement, which copies data from the memory access location
/// specified by the access relation @p AccRel to a location
/// specified by @p AccRelPacked. Subsequently, change memory access
/// relation @p AccRel of @p Stmt to @p AccRelPacked.
///
/// @param Stmt The statement with an access @p AccRel
/// to be redirected.
/// @param AccRel The original access relation of @p Stmt.
/// @param AccRelPacked A new access relation.
/// @param OuterDomainMap A map from the scattering to the outermost
/// scattering dimensions that form the prefix of the
/// copy statement domain.
/// @param MapOldIndVar The relation, which maps original induction variables
/// to the ones, which are produced by schedule
/// transformations.
/// @return A new copy statement.
static ScopStmt *addScopStmt(ScopStmt *Stmt, isl::map AccRel,
                             isl::map AccRelPacked, isl::map OuterDomainMap,
                             isl::map MapOldIndVar) {
  // Relate the original array elements to the packed ones.
  // { Memref[] -> Packed[] }
  isl::map PackedTranslator = AccRelPacked.apply_domain(AccRel);
  isl::set Domain = Stmt->getDomain();
  // Prevent the unbound optimum: restrict MapOldIndVar to the statement
  // domain so the derived relations stay bounded.
  isl::map CopyFrom = MapOldIndVar.intersect_domain(Domain);
  // Map each scattering point to the array element it reads.
  // { Scatter[] -> MemrefA[] }
  CopyFrom = CopyFrom.reverse().apply_range(AccRel);
  // Pair the outermost scattering dimensions with the accessed elements to
  // form one copy-statement instance per (outer iteration, array element).
  // { Scatter[] -> CopyStmt[] }
  isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
  // { CopyStmt[] }
  isl::set CopyDomain = DomainTranslator.range();
  // Translate the access relations to the new domain.
  // { CopyStmt[] -> MemrefA[] }
  CopyFrom = CopyFrom.apply_domain(DomainTranslator);
  // { CopyStmt[] -> PackedA[] }
  isl::map CopyTo = CopyFrom.apply_range(PackedTranslator);
  return Stmt->getParent()->addScopStmt(CopyFrom, CopyTo, CopyDomain);
}
static isl::schedule_node optimizePackedB(isl::schedule_node Node,
                                          ScopStmt *Stmt, isl::map MapOldIndVar,
                                          MicroKernelParamsTy MicroParams,
                                          MacroKernelParamsTy MacroParams,
                                          MatMulInfoTy &MMI) {
  // Introduce the packed array Packed_B, create a statement that copies B
  // into it, and redirect the accesses of @p Stmt to the packed layout.
  Scop *S = Stmt->getParent();
  isl::set Domain = Stmt->getDomain();
  // Create packed array.
  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Nr;
  ScopArrayInfo *PackedB =
      S->createScopArrayInfo(MMI.B->getElementType(), "Packed_B",
                             {FirstDimSize, SecondDimSize, ThirdDimSize});
  // Compute the access relation for copying from B to PackedB.
  isl::map AccRelB = MMI.B->getLatestAccessRelation();
  // Number of index dimensions of B; used below to project the array-element
  // dimensions out of the extension map.
  unsigned AccRelBDimOutNum = unsignedFromIslSize(AccRelB.dim(isl::dim::out));
  isl::map AccRelPackedB = getMatMulAccRel(MapOldIndVar, 3, 7);
  AccRelPackedB =
      AccRelPackedB.set_tuple_id(isl::dim::out, PackedB->getBasePtrId());
  // Compute the domain for the copy statement.
  // Construct the copy statement domain out of the 2 outermost scatter
  // dimensions (to match the 2 band nodes surrounding the extension node) and
  // the array elements to copy (one statement instance per array element).
  // { Scatter[] }
  isl::set ScatterDomain = MapOldIndVar.intersect_domain(Domain).range();
  // { Scatter[] -> OutermostScatter[] }
  isl::map OuterDomainMap =
      makeIdentityMap(ScatterDomain, true).project_out(isl::dim::out, 2, 7);
  // Create the copy statement and redirect access.
  ScopStmt *CopyStmt =
      addScopStmt(Stmt, AccRelB, AccRelPackedB, OuterDomainMap, MapOldIndVar);
  MMI.B->setNewAccessRelation(AccRelPackedB);
  // Insert the copy statement into the schedule tree.
  // { Scatter[] -> CopyStmt[] }
  isl::map ExtScatterCopy = makeIdentityMap(CopyStmt->getDomain(), true);
  ExtScatterCopy =
      ExtScatterCopy.project_out(isl::dim::in, 2, AccRelBDimOutNum);
  return createExtensionNode(Node, ExtScatterCopy);
}
/// Create the copy of a sub-matrix of A (Packed_A).
///
/// Create the packed array Packed_A, a copy statement that fills it from A,
/// and redirect the accesses of the matrix-multiplication statement to the
/// packed layout. The copy statement is grafted into the schedule tree via
/// an extension node.
///
/// @param Node         The schedule node to be optimized.
/// @param MapOldIndVar The relation, which maps original induction variables
///                     to the ones, which are produced by schedule
///                     transformations.
/// @param MicroParams  Parameters of the BLIS micro kernel.
/// @param MacroParams  Parameters of the BLIS macro kernel.
/// @param MMI          Parameters of the matrix multiplication operands.
/// @return The schedule node with the grafted extension node.
static isl::schedule_node optimizePackedA(isl::schedule_node Node, ScopStmt *,
                                          isl::map MapOldIndVar,
                                          MicroKernelParamsTy MicroParams,
                                          MacroKernelParamsTy MacroParams,
                                          MatMulInfoTy &MMI) {
  // Recover the statement from the tuple id of the schedule's input space.
  isl::id InputDimsId = MapOldIndVar.get_tuple_id(isl::dim::in);
  ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
  isl::set Domain = Stmt->getDomain();
  // Create the packed array.
  unsigned FirstDimSize = MacroParams.Mc / MicroParams.Mr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Mr;
  ScopArrayInfo *PackedA = Stmt->getParent()->createScopArrayInfo(
      MMI.A->getElementType(), "Packed_A",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  // Compute the access relation for copying from A to PackedA.
  isl::map AccRelA = MMI.A->getLatestAccessRelation();
  // Number of index dimensions of A; used below to project the array-element
  // dimensions out of the extension map.
  unsigned AccRelADimOutNum = unsignedFromIslSize(AccRelA.dim(isl::dim::out));
  isl::map AccRelPackedA = getMatMulAccRel(MapOldIndVar, 4, 6);
  AccRelPackedA =
      AccRelPackedA.set_tuple_id(isl::dim::out, PackedA->getBasePtrId());
  // Compute the domain for the copy statement.
  // Construct the copy statement domain out of the 3 outermost scatter
  // dimensions (to match the 3 band nodes surrounding the extension node) and
  // the array elements to copy (one statement instance per array element).
  // { Scatter[] }
  isl::set ScatterDomain = MapOldIndVar.intersect_domain(Domain).range();
  // { Scatter[] -> OutermostScatter[] }
  isl::map OuterDomainMap =
      makeIdentityMap(ScatterDomain, true).project_out(isl::dim::out, 3, 6);
  // Create the copy statement and redirect access.
  ScopStmt *CopyStmt =
      addScopStmt(Stmt, AccRelA, AccRelPackedA, OuterDomainMap, MapOldIndVar);
  MMI.A->setNewAccessRelation(AccRelPackedA);
  // Insert into the schedule tree.
  // { Scatter[] -> CopyStmt[] }
  isl::map ExtScatterCopy = makeIdentityMap(CopyStmt->getDomain(), true);
  ExtScatterCopy =
      ExtScatterCopy.project_out(isl::dim::in, 3, AccRelADimOutNum);
  return createExtensionNode(Node, ExtScatterCopy);
}
/// Apply the packing transformation. | /// Apply the packing transformation. | ||||
/// | /// | ||||
/// The packing transformation can be described as a data-layout | /// The packing transformation can be described as a data-layout | ||||
/// transformation that requires to introduce a new array, copy data | /// transformation that requires to introduce a new array, copy data | ||||
/// to the array, and change memory access locations to reference the array. | /// to the array, and change memory access locations to reference the array. | ||||
▲ Show 20 Lines • Show All 912 Lines • ▼ Show 20 Lines | static bool isTCPattern(isl::schedule_node Node, const Dependences *D, | ||||
isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule); | isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule); | ||||
if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain))) | if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain))) | ||||
return true; | return true; | ||||
return false; | return false; | ||||
} | } | ||||
/// Order the dimensions of operands of the tensor contraction.
///
/// To improve the spatial locality of the data, apply the heuristic
/// introduced in [1]. It logically reorders the tensor dimensions in
/// three steps:
///
/// 1. Sort the dimensions in bundles I and J by increasing stride in
///    the tensor C.
/// 2. If tensor A contains the last dimension of C, swap A with B and I
///    with J.
/// 3. Sort the dimensions in the bundle P by increasing stride in the
///    tensor A.
///
/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
///
/// @param TCI The information about the tensor contraction.
static void orderDimensions(TCInfoTy &TCI) {
  // Step 1: emit the dimensions of bundles I and J in the order in which
  // they appear in C. A dimension contained in both bundles goes to I.
  for (int Dim : TCI.CDimensions) {
    assert(TCI.I.count(Dim) || TCI.J.count(Dim));
    if (TCI.I.count(Dim))
      TCI.OrderedI.push_back(Dim);
    else if (TCI.J.count(Dim))
      TCI.OrderedJ.push_back(Dim);
  }
  // Step 2: if A contains the last (fastest-varying) dimension of C,
  // exchange the roles of A and B along with their dimension bundles.
  if (TCI.I.count(TCI.CDimensions.back())) {
    MemoryAccess *PrevA = TCI.A;
    TCI.A = TCI.B;
    TCI.B = PrevA;
    TCI.I.swap(TCI.J);
    TCI.ADimensions.swap(TCI.BDimensions);
    TCI.OrderedI.swap(TCI.OrderedJ);
  }
  // Step 3: emit the dimensions of bundle P in the order in which they
  // appear in A.
  for (int Dim : TCI.ADimensions)
    if (TCI.P.count(Dim))
      TCI.OrderedP.push_back(Dim);
}
/// Logically map dimensions of a tensor to dimensions of an imaginary matrix
///
/// flattenDimensions returns a new dimension, which is a linear combination of
/// @p Dimensions in order specified by @p DimensionsOrder. Specifically, a new
/// dimension J can be represented as
/// J = Dimensions[D0] A0 + … + Dimensions[D(N - 1)] * A(N - 1),
/// where Di = DimensionsOrder[i],
/// Ai = DimensionsSizes[i+1] * … * DimensionsSizes[N - 1].
///
/// Using the described one-to-one mapping, which is introduced in [1], the
/// optimization of the matrix multiplication of the corresponding imaginary
/// matrices can be performed for the tensor contraction.
///
/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
///
/// @param Dimensions Dimensions of the tensor.
/// @param DimensionsOrder An order of dimensions in the linear combination.
/// @param DimensionsSizes Sizes of dimensions in the linear combination.
/// @return A new dimension.
static isl::union_pw_aff
flattenDimensions(isl::multi_union_pw_aff Dimensions,
                  const SmallVector<int> &DimensionsOrder,
                  const SmallVector<int> &DimensionsSizes) {
  isl::union_pw_aff NewDimension;
  isl::ctx Ctx = Dimensions.ctx();
  for (const int &Dim : DimensionsOrder) {
    // Handle the first element in the linear combination of dimensions.
    if (NewDimension.is_null()) {
      NewDimension = isl::manage(
          isl_multi_union_pw_aff_get_union_pw_aff(Dimensions.get(), Dim));
      continue;
    }
    // Scale the partial sum by the size of the next dimension. Scaling by
    // one is the identity, so it is skipped — this also covers non-positive
    // sizes (presumably "size unknown"; the original code scaled those by 1).
    int DimSize = DimensionsSizes[Dim];
    if (DimSize > 1)
      NewDimension = isl::manage(isl_union_pw_aff_scale_val(
          NewDimension.release(), isl::val(Ctx, DimSize).release()));
    // Add a new element to the linear combination.
    NewDimension = NewDimension.add(isl::manage(
        isl_multi_union_pw_aff_get_union_pw_aff(Dimensions.get(), Dim)));
  }
  return NewDimension;
}
/// Logically represent tensor contraction operands as matrix multiplication
/// operands.
///
/// Translate the operand description of a tensor contraction into the
/// operand description used by the matrix multiplication optimization.
///
/// @param TCI The information about the tensor contraction.
/// @return The information about matrix multiplication operands.
static MatMulInfoTy convertIntoMatMulInfo(const TCInfoTy &TCI) {
  MatMulInfoTy MMI;
  MMI.A = TCI.A;
  MMI.B = TCI.B;
  MMI.ReadFromC = TCI.ReadFromC;
  MMI.WriteToC = TCI.WriteToC;
  return MMI;
}
/// Logically cast the schedule tree, which represents the tensor contraction,
/// into the schedule tree, which represents the matrix multiplication
///
/// convertIntoMMMSchedule maps dimensions of tensors to dimensions of
/// imaginary matrices, which are operands of the matrix multiplication. Using
/// the described one-to-one mapping, the optimization of the matrix
/// multiplication of the corresponding imaginary matrices can be performed
/// for the tensor contraction.
///
/// @param Node The schedule node to be optimized.
/// @param TTI  Target Transform Info.
/// @param TCI  The information about the tensor contraction.
/// @return The optimized schedule node.
static isl::schedule_node
convertIntoMMMSchedule(isl::schedule_node Node,
                       const llvm::TargetTransformInfo *TTI,
                       const TCInfoTy &TCI) {
  assert(TTI && "The target transform info should be provided.");
  // Obtain dimensions of tensors.
  isl::union_set Domain = Node.get_universe_domain();
  isl::union_pw_multi_aff IdentityUnionPwAff =
      Domain.identity_union_pw_multi_aff();
  auto Dimensions = isl::multi_union_pw_aff(IdentityUnionPwAff);
  Dimensions = Dimensions.reset_tuple_id(isl::dim::set);
  // Map dimensions of tensors to dimensions of imaginary matrices by
  // flattening each bundle into a single linear combination.
  isl::union_pw_aff IDimensions =
      flattenDimensions(Dimensions, TCI.OrderedI, TCI.DimensionSizes);
  isl::union_pw_aff JDimensions =
      flattenDimensions(Dimensions, TCI.OrderedJ, TCI.DimensionSizes);
  isl::union_pw_aff PDimensions =
      flattenDimensions(Dimensions, TCI.OrderedP, TCI.DimensionSizes);
  // Keep only three output dimensions (I, J, P) and substitute the flattened
  // combinations computed above.
  unsigned DimNumber = unsignedFromIslSize(Dimensions.dim(isl::dim::out));
  Dimensions = isl::manage(isl_multi_union_pw_aff_drop_dims(
      Dimensions.release(), isl_dim_out, 3, DimNumber - 3));
  Dimensions = Dimensions.set_union_pw_aff(0, IDimensions);
  Dimensions = Dimensions.set_union_pw_aff(1, JDimensions);
  Dimensions = Dimensions.set_union_pw_aff(2, PDimensions);
  // Modify the schedule tree: walk up to the enclosing domain or filter node,
  // cut its subtree, and insert the new partial schedule.
  auto NodeType = isl_schedule_node_get_type(Node.get());
  while ((NodeType != isl_schedule_node_domain) &&
         (NodeType != isl_schedule_node_filter)) {
    // Walking through a sequence node is not supported here; bail out loudly
    // in debug builds. (The original text used a stray wide-string literal
    // `L"..."`; a narrow literal is the correct assert message form.)
    assert((NodeType != isl_schedule_node_sequence) &&
           "Prevent the undefined behavior");
    Node = Node.parent();
    NodeType = isl_schedule_node_get_type(Node.get());
  }
  Node = Node.child(0);
  Node = isl::manage(isl_schedule_node_cut(Node.release()));
  return Node.insert_partial_schedule(Dimensions);
}
/// Apply the BLIS matmul optimization pattern if possible.
///
/// Map dimensions of tensors to dimensions of imaginary matrices,
/// which are operands of the matrix multiplication. Using the described
/// one-to-one mapping, which is introduced in [1], the optimization of
/// the matrix multiplication of the corresponding imaginary matrices
/// can be performed for the tensor contraction.
///
/// Specifically, make the loops containing the matrix multiplication be the
/// innermost loops and apply the BLIS matmul optimization pattern. BLIS
/// implements matrix multiplication as three nested loops around
/// a macro-kernel, plus two packing routines. The macro-kernel is implemented
/// in terms of two additional loops around a micro-kernel.
/// The micro-kernel is a loop around a rank-1 (i.e., outer product) update.
///
/// For a detailed description please see [2].
///
/// The order of the loops defines the data reused in the BLIS implementation
/// of matrix multiplication ([2]). In particular, elements of the matrix B,
/// the second operand of matrix multiplication, are reused between
/// iterations of the innermost loop. To keep the reused data in cache, only
/// elements of matrix A, the first operand of matrix multiplication,
/// should be evicted during an iteration of the innermost loop. To provide
/// such a cache replacement policy, elements of the matrix A can,
/// in particular, be loaded first and, consequently, be least-recently-used.
///
/// In our case matrices are stored in row-major order instead of
/// column-major order used in the BLIS implementation ([2]). It affects only
/// on the form of the BLIS micro kernel and the computation of its
/// parameters. In particular, reused elements of the matrix B are
/// successively multiplied by specific elements of the matrix A.
///
/// Refs.:
/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
/// [2] - Analytical Modeling is Enough for High Performance BLIS
/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
/// Technical Report, 2014
/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
///
/// @see ScheduleTreeOptimizer::createMicroKernel
/// @see ScheduleTreeOptimizer::createMacroKernel
/// @see getMicroKernelParams
/// @see getMacroKernelParams
///
/// @param Node The node that contains a band to be optimized. The node
/// is required to successfully pass
/// ScheduleTreeOptimizer::isTCPattern.
/// @param TTI Target Transform Info.
/// @param TCI The information about the tensor contraction.
///
/// @returns The transformed schedule node (partially transformed if the
/// data-layout optimization is not applicable).
static isl::schedule_node
optimizeTCPattern(isl::schedule_node Node, const llvm::TargetTransformInfo *TTI,
                  TCInfoTy &TCI) {
  // Reorder the tensor dimensions according to the locality heuristic.
  orderDimensions(TCI);
  assert(TTI && "The target transform info should be provided.");
  // View the contraction as a matrix multiplication and apply the BLIS
  // macro-/micro-kernel structure.
  Node = convertIntoMMMSchedule(Node, TTI, TCI);
  MatMulInfoTy MMI = convertIntoMatMulInfo(TCI);
  MicroKernelParamsTy MicroKernelParams = getMicroKernelParams(TTI, MMI);
  MacroKernelParamsTy MacroKernelParams =
      getMacroKernelParams(TTI, MicroKernelParams, MMI);
  Node = createMacroKernel(Node, MacroKernelParams);
  Node = createMicroKernel(Node, MicroKernelParams);
  // If any blocking parameter degenerates to 1, skip the remaining
  // data-layout transformation and return the kernelized schedule as is.
  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
      MacroKernelParams.Kc == 1)
    return Node;
  isl::map MapOldIndVar = getInductionVariablesSubstitution(
      Node, MicroKernelParams, MacroKernelParams);
  if (MapOldIndVar.is_null())
    return Node;
  Node = markLoopVectorizerDisabled(Node.parent()).child(0);
  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
                                          MacroKernelParams, MMI);
}
} // namespace | } // namespace | ||||
isl::schedule_node
polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
                                const llvm::TargetTransformInfo *TTI,
                                const Dependences *D) {
  // Check for the tensor contraction pattern first (when enabled).
  TCInfoTy TCI;
  if (PMBasedTCOpts && isTCPattern(Node, D, TCI)) {
    LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
    return optimizeTCPattern(Node, TTI, TCI);
  }
  // Otherwise, try the plain matrix multiplication pattern (when enabled).
  MatMulInfoTy MMI;
  if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
    LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
    return optimizeMatMulPattern(Node, TTI, MMI);
  }
  // Neither pattern matched: return a null schedule node.
  return {};
}