Index: polly/trunk/lib/Transform/ScheduleOptimizer.cpp =================================================================== --- polly/trunk/lib/Transform/ScheduleOptimizer.cpp +++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp @@ -851,6 +851,8 @@ static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern( __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar, MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) { + // Check whether the memory accesses of the SCoP statement correspond to + // the matrix multiplication pattern and, if so, obtain them. auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in); auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId)); isl_id_free(InputDimsId); @@ -860,6 +862,9 @@ isl_map_free(MapOldIndVar); return Node; } + + // Create a copy statement that corresponds to the memory access to the + // matrix B, the second operand of the matrix multiplication. Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); Node = isl_schedule_node_parent(Node); @@ -879,10 +884,19 @@ isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1); ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1); auto *Domain = Stmt->getDomain(); + + // Restrict the domain of the copy statement so that it is executed only + // when its originating statement is also executed. + auto *DomainId = isl_set_get_tuple_id(Domain); auto *NewStmt = Stmt->getParent()->addScopStmt( OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain)); + ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId)); + ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain)); ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId()); Node = createExtensionNode(Node, ExtMap); + + // Create a copy statement that corresponds to the memory access + // to the matrix A, the first operand of the matrix multiplication. 
Node = isl_schedule_node_child(Node, 0); AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 6); FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr; @@ -896,7 +910,12 @@ isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1); isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1); NewStmt = Stmt->getParent()->addScopStmt( - OldAcc, MemAccessA->getAccessRelation(), Domain); + OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain)); + + // Restrict the domain of the copy statement so that it is executed only + // when its originating statement is also executed. + ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId); + ExtMap = isl_map_intersect_range(ExtMap, Domain); ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId()); Node = createExtensionNode(Node, ExtMap); Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); Index: polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll =================================================================== --- polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll +++ polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll @@ -0,0 +1,127 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 -polly-ast -analyze < %s | FileCheck %s +; +; /* C := alpha*A*B + beta*C */ +; /* _PB_NK % Kc != 0 */ +; for (i = 0; i < _PB_NI; i++) +; for (j = 0; j < _PB_NJ; j++) +; { +; C[i][j] *= beta; +; for (k = 0; k < _PB_NK; ++k) +; C[i][j] += alpha * A[i][k] * B[k][j]; +; } +; +; CHECK: { +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= 32; c1 += 1) { +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1) +; CHECK-NEXT: for (int c3 = 0; c3 
<= 31; c3 += 1) +; CHECK-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3); +; CHECK-NEXT: } +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: for (int c0 = 0; c0 <= 65; c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) { +; CHECK-NEXT: for (int c3 = 16 * c0; c3 <= 16 * c0 + 15; c3 += 1) +; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1) +; CHECK-NEXT: CopyStmt_0(0, c3, c4); +; CHECK-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1) +; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(1022, 256 * c1 + 255); c5 += 1) +; CHECK-NEXT: CopyStmt_1(c3, 0, c5); +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: // Register tiling - Tiles +; CHECK-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) +; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: { +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: 
Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + 
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1023 x double]* %arg6, [1056 x double]* %arg7) #0 { +bb: + br label %bb8 + +bb8: ; preds = %bb29, %bb + %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ] + br label %bb9 + +bb9: ; preds = %bb26, %bb8 + %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ] + %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10 + %tmp12 = load double, double* %tmp11, align 8 + %tmp13 = fmul double %tmp12, %arg4 + store double %tmp13, double* %tmp11, align 8 + br label %Copy_0 + +Copy_0: ; preds = %Copy_0, %bb9 + %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ] + %tmp16 = getelementptr inbounds [1023 x double], [1023 x double]* %arg6, i64 %tmp, i64 %tmp15 + %tmp17 = load double, double* %tmp16, align 8 + %tmp18 = fmul double %tmp17, %arg3 + %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10 + %tmp20 = load double, double* %tmp19, align 8 + %tmp21 = fmul double %tmp18, %tmp20 + %tmp22 = load double, double* %tmp11, align 8 + %tmp23 = fadd double %tmp22, %tmp21 + store double %tmp23, double* %tmp11, align 8 + %tmp24 = add nuw nsw i64 %tmp15, 1 + %tmp25 = icmp ne i64 %tmp24, 1023 + br i1 %tmp25, label %Copy_0, label %bb26 + +bb26: ; preds = %Copy_0 + %tmp27 = add nuw nsw i64 %tmp10, 1 + %tmp28 = icmp ne i64 %tmp27, 1056 + br i1 %tmp28, label %bb9, label %bb29 + +bb29: ; preds = %bb26 + %tmp30 = add nuw nsw i64 %tmp, 1 + %tmp31 = icmp ne i64 %tmp30, 1056 + br i1 %tmp31, label %bb8, label %bb32 + +bb32: ; preds = %bb29 + ret void +} + +attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }