This commit adds initial support for mapping SCF parallel loops with
reductions onto GPUs using atomics and the GPU allreduce operation.
This commit is an initial implementation adding reduction support to the SCF-to-GPU mapping. It is very much a work in progress that I'm hoping to get feedback on, because a couple of parts of the implementation feel awkward or hacky. The approach handles simply nested parallel loops that have been tiled for blocks and then threads. Thread-level reductions are handled with all-reduce operations (with some extra work around the iteration-space guards), and block-level reductions are handled with atomic RMW operations on GPU memory.
As an example, take the following affine code, which computes a parallel sum and max reduction over an input array:
```mlir
func.func @reduce2(%input : memref<?xf32>) -> (f32, f32) {
  %zero = arith.constant 0. : f32
  %zero_0 = arith.constant 0 : index
  %n = memref.dim %input, %zero_0 : memref<?xf32>
  %reduceval, %maxval = affine.for %i = 0 to %n iter_args(%sum = %zero, %max = %zero) -> (f32, f32) {
    %0 = affine.load %input[%i] : memref<?xf32>
    %1 = arith.addf %0, %sum : f32
    %2 = arith.maxf %0, %max : f32
    affine.yield %1, %2 : f32, f32
  }
  return %reduceval, %maxval : f32, f32
}
```
It is lowered with

```
./bin/mlir-opt ../testing.mlir -pass-pipeline="builtin.module(func.func(affine-parallelize{parallel-reductions}, lower-affine, canonicalize, scf-parallel-loop-tiling{parallel-loop-tile-sizes=256}, scf-for-loop-canonicalization, gpu-map-parallel-loops))"
```

to:
```mlir
#map = affine_map<(d0, d1, d2) -> (256, d1 - d2)>
module {
  func.func @reduce2(%arg0: memref<?xf32>) -> (f32, f32) {
    %c256 = arith.constant 256 : index
    %cst = arith.constant 0xFF800000 : f32
    %c1 = arith.constant 1 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %dim = memref.dim %arg0, %c0 : memref<?xf32>
    %0:2 = scf.parallel (%arg1) = (%c0) to (%dim) step (%c256) init (%cst_0, %cst) -> (f32, f32) {
      %3 = affine.min #map(%c256, %dim, %arg1)
      %4:2 = scf.parallel (%arg2) = (%c0) to (%3) step (%c1) init (%cst_0, %cst) -> (f32, f32) {
        %5 = arith.addi %arg2, %arg1 : index
        %6 = memref.load %arg0[%5] : memref<?xf32>
        scf.reduce(%6) : f32 {
        ^bb0(%arg3: f32, %arg4: f32):
          %7 = arith.addf %arg3, %arg4 : f32
          scf.reduce.return %7 : f32
        }
        scf.reduce(%6) : f32 {
        ^bb0(%arg3: f32, %arg4: f32):
          %7 = arith.maxf %arg3, %arg4 : f32
          scf.reduce.return %7 : f32
        }
        scf.yield
      } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
      scf.reduce(%4#0) : f32 {
      ^bb0(%arg2: f32, %arg3: f32):
        %5 = arith.addf %arg2, %arg3 : f32
        scf.reduce.return %5 : f32
      }
      scf.reduce(%4#1) : f32 {
      ^bb0(%arg2: f32, %arg3: f32):
        %5 = arith.maxf %arg2, %arg3 : f32
        scf.reduce.return %5 : f32
      }
      scf.yield
    } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    %1 = arith.addf %0#0, %cst_0 : f32
    %2 = arith.maxf %0#1, %cst_0 : f32
    return %1, %2 : f32, f32
  }
}
```
Final application of the conversion pass with this patch yields:
```mlir
#map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
#map2 = affine_map<(d0, d1, d2) -> (256, d1 - d2)>
module {
  func.func @reduce2(%arg0: memref<?xf32>) -> (f32, f32) {
    %c256 = arith.constant 256 : index
    %cst = arith.constant 0xFF800000 : f32
    %c1 = arith.constant 1 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %dim = memref.dim %arg0, %c0 : memref<?xf32>
    %memref = gpu.alloc () : memref<f32>
    %alloca = memref.alloca() : memref<f32>
    memref.store %cst_0, %alloca[] : memref<f32>
    gpu.memcpy %memref, %alloca : memref<f32>, memref<f32>
    %memref_1 = gpu.alloc () : memref<f32>
    %alloca_2 = memref.alloca() : memref<f32>
    memref.store %cst, %alloca_2[] : memref<f32>
    gpu.memcpy %memref_1, %alloca_2 : memref<f32>, memref<f32>
    %c1_3 = arith.constant 1 : index
    %0 = affine.apply #map(%dim)[%c0, %c256]
    %c256_4 = arith.constant 256 : index
    %1 = affine.apply #map(%c256_4)[%c0, %c1]
    gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %0, %arg8 = %c1_3, %arg9 = %c1_3) threads(%arg4, %arg5, %arg6) in (%arg10 = %1, %arg11 = %c1_3, %arg12 = %c1_3) {
      %6 = affine.apply #map1(%arg1)[%c256, %c0]
      %7 = affine.min #map2(%c256, %dim, %6)
      %8 = affine.apply #map1(%arg4)[%c1, %c0]
      %9 = arith.cmpi slt, %8, %7 : index
      %10:2 = scf.if %9 -> (f32, f32) {
        %17 = arith.addi %8, %6 : index
        %18 = memref.load %arg0[%17] : memref<?xf32>
        scf.yield %18, %18 : f32, f32
      } else {
        scf.yield %cst_0, %cst : f32, f32
      }
      %11 = gpu.all_reduce max %10#1 uniform { } : (f32) -> f32
      %12 = gpu.all_reduce add %10#0 uniform { } : (f32) -> f32
      %13 = gpu.thread_id x
      %c0_7 = arith.constant 0 : index
      %14 = arith.cmpi eq, %13, %c0_7 : index
      scf.if %14 {
        %17 = memref.atomic_rmw addf %12, %memref[] : (f32, memref<f32>) -> f32
      }
      %15 = gpu.thread_id x
      %c0_8 = arith.constant 0 : index
      %16 = arith.cmpi eq, %15, %c0_8 : index
      scf.if %16 {
        %17 = memref.atomic_rmw maxf %11, %memref_1[] : (f32, memref<f32>) -> f32
      }
      gpu.terminator
    } {SCFToGPU_visited}
    %alloca_5 = memref.alloca() : memref<f32>
    gpu.memcpy %alloca_5, %memref : memref<f32>, memref<f32>
    %2 = memref.load %alloca_5[] : memref<f32>
    %alloca_6 = memref.alloca() : memref<f32>
    gpu.memcpy %alloca_6, %memref_1 : memref<f32>, memref<f32>
    %3 = memref.load %alloca_6[] : memref<f32>
    %4 = arith.addf %2, %cst_0 : f32
    %5 = arith.maxf %3, %cst_0 : f32
    return %4, %5 : f32, f32
  }
}
```
I would consider having an operation, or maybe attributes to gpu.launch, that represent the cross-block reduction. That operation could be lowered separately and hopefully make the implementation here less complex.
In general, I couldn't always follow where the new IR gets inserted. Specifically, it looks like the if providing the reduction or neutral value is always inserted at the beginning, but it may depend on values computed later within the loop, e.g., when the logic is more complex than loading from a memref indexed by the loop/block index. It feels like it could be inserted right after the scf.reduce so it is dominated by the same set of values as the reduce itself. Similarly, the all_reduce should be inserted there, after the if, which would remove the non-guaranteed hack that attempts to move it to the end.
| mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp | |
|---|---|
| 36 | This is not allowed in LLVM: https://llvm.org/docs/CodingStandards.html#include-iostream-is-forbidden |
| 539 | Why is this limited to threadX? What happens if the reduction loop is mapped to other thread dimensions? |
| 546 | Nit: please use /*thisStyleOfCommentForArgumentName=*/1 to avoid magic values. |
| 561 | It isn't very fresh in my memory even if I may have reviewed this, but it does feel quite convoluted. However, that's what the existing code is doing, so I would just keep doing the same for this patch and submit a refactoring as a separate patch. |
| 624–625 | Same place as matchReduction? |
| 682 | Nit: please expand auto unless the type is obvious from context, e.g., the RHS is a cast, or annoying to spell out, e.g., iterators and lambdas. Note that here the type isn't clear from the context and it may be either MLIRContext * or MLIRContext & (clang-tidy will complain about this). https://llvm.org/docs/CodingStandards.html#use-auto-type-deduction-to-make-code-more-readable |
| 699 | Nit: no need to prefix most LLVM ADTs with llvm:: in MLIR code, as they are re-exported into the MLIR namespace. |
| 700 | Nit: don't specify the number of stack elements in small containers unless you have a strong reason to pick a specific value. |
| 721–753 | Instead of doing all this, can we just allocate memory that's accessible from both host and device? Then we can just store into it on the host. Or does that come with a performance penalty? |
| 726 | Nit: you should be able to construct an ArrayRef<int64_t> rather than a vector. Here and below. |
| 734–735 | I suppose this is related to how the underlying memset function is implemented. |
| 747 | I would expect that the op has properly named arguments in ODS so we can see which of the values is the source and which is the destination. |
| 862–863 | Nit: you can just do cast<scf::IfOp>(parent), which asserts internally, and remove your own assert. |
| 944 | RAUW is forbidden in patterns. Use the rewriter API instead. |
| 966 | Low-level IR mutation APIs are forbidden in patterns. Use the rewriter API instead. |
Thanks for the initial pass; let me respond to some high-level things before going through and making changes to the code. I'll fix all the nits and the code-style issues around the rewriter; thanks for the pointers.
> I would consider having an operation, or maybe attributes to gpu.launch, that represent the cross-block reduction. That operation could be lowered separately and hopefully make the implementation here less complex.
I think an operation makes sense, but I don't view myself as knowledgeable enough about this community and consumers of the GPU dialect to go forward with proposing and adding all the things needed for a new operation. I'm not sure that should block the addition of a feature like this.
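Purely for illustration, such an op might look something like the sketch below. The name gpu.grid_reduce and its syntax are invented here to make the suggestion concrete; no such op exists in the GPU dialect today.

```mlir
// Hypothetical, for illustration only: a grid-wide analogue of gpu.all_reduce
// that would encapsulate the cross-block combination this patch currently
// open-codes with atomic RMWs on a gpu-allocated buffer.
%total = gpu.grid_reduce add %partial uniform { } : (f32) -> f32
```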
> Specifically, it looks like the if providing the reduction or neutral value is always inserted at the beginning, but it may depend on values computed later within the loop, e.g., when the logic is more complex than loading from a memref indexed by the loop/block index.
AFAICT, the if is inserted as a guard to make sure that only the thread ids inside the thread-mapped loop enter the loop body. It is not dependent on other things that may be inside the loop, because it must surround the entire loop body. On the other hand, the gpu.all_reduce _must_ go outside the if, because every thread must execute the collective block-wide reduction whether or not individual threads in a warp of the block enter the loop body. So that's why I've structured it this way.
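Concretely, this is the shape in the lowered output above (excerpted from the IR listing earlier in this thread): the guard yields either the loaded value or the neutral element, and the collective all-reduces sit after the guard so every thread in the block executes them.

```mlir
// Excerpt from the lowered output above: the guard selects the real value or
// the neutral element, and gpu.all_reduce executes unconditionally afterwards.
%10:2 = scf.if %9 -> (f32, f32) {
  %17 = arith.addi %8, %6 : index
  %18 = memref.load %arg0[%17] : memref<?xf32>
  scf.yield %18, %18 : f32, f32
} else {
  scf.yield %cst_0, %cst : f32, f32   // neutral elements for add and max
}
%11 = gpu.all_reduce max %10#1 uniform { } : (f32) -> f32
%12 = gpu.all_reduce add %10#0 uniform { } : (f32) -> f32
```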
> Instead of doing all this, can we just allocate memory that's accessible from both host and device? Then we can just store into it on the host. Or does that come with a performance penalty?
If I remember correctly, contended atomic RMWs to zero-copy memory perform worse than those to framebuffer memory. However, I'll run a simple experiment later and see -- it would be much simpler to just reduce to a host buffer.
> I would expect that the op has properly named arguments in ODS so we can see which of the values is the source and which is the destination.
I looked deeper into this and the driver function that this ends up calling is able to tell whether a pointer is host or device allocated.
Whoops, I forgot one high-level comment:
> Why is this limited to threadX? What happens if the reduction loop is mapped to other thread dimensions?
I'm not sure what to do if there are multiple nested loops mapped to different thread dimensions. Consider:
```
%0 = pfor i in ...        -- map thread x
       reduce:
         %1 = pfor j in ...  -- map thread y
                reduce: ...
```
If the thread-x loop uses the value %1 in a way that isn't just collapsing both the x and y thread dimensions into a single scalar, then we need some kind of sub-block reduction primitive that doesn't currently exist. The use case I have in mind is not going to map multiple thread dimensions when doing reductions, so I'm incentivized to support my simple use case.
> If I remember correctly, contended atomic RMWs to zero-copy memory perform worse than those to framebuffer memory. However, I'll run a simple experiment later and see -- it would be much simpler to just reduce to a host buffer.
A quick experiment showed that, for the case of reducing to a single element, there wasn't any noticeable difference between reductions into a buffer in zero-copy memory and a buffer in the GPU framebuffer. I'll clean up the implementation here to allocate the reduction buffers in zero-copy memory, which should remove some of the cruft. However, the GPUToLLVM pass doesn't currently support host-shared GPU allocations! I can add support for this for CUDA in this PR or a separate one, depending on what makes sense. I don't have access to AMD machines, so I wouldn't be able to add the corresponding definitions in the AMD GPU runtime.
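For reference, a host-shared reduction buffer in the lowered IR would presumably look something like the sketch below; this assumes the host_shared form of gpu.alloc, which the GPUToLLVM lowering would need to learn to handle, and reuses the value names from the output above.

```mlir
// Sketch: allocate the reduction cell in host-shared (zero-copy) memory so the
// host can initialize and read it directly, avoiding the staging alloca and
// the explicit gpu.memcpy in each direction.
%memref = gpu.alloc host_shared () : memref<f32>
memref.store %cst_0, %memref[] : memref<f32>
```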
> A quick experiment showed that, for the case of reducing to a single element, there wasn't any noticeable difference between reductions into a buffer in zero-copy memory and a buffer in the GPU framebuffer. I'll clean up the implementation here to allocate the reduction buffers in zero-copy memory, which should remove some of the cruft. However, the GPUToLLVM pass doesn't currently support host-shared GPU allocations! I can add support for this for CUDA in this PR or a separate one, depending on what makes sense. I don't have access to AMD machines, so I wouldn't be able to add the corresponding definitions in the AMD GPU runtime.
Please ignore this. I had some if-statements backwards in the lowering I used to test this, and the zero-copy buffer was not actually getting allocated. When zero-copy memory is used, the reduction is significantly slower (around two orders of magnitude). So we need to do it this way, and I'll update this code to thread a GPU async token through the allocations and copies.
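The token threading could look roughly like the following; this is only a sketch, with value names chosen to match the lowered output above.

```mlir
// Sketch: chain the device allocation and the host-to-device copy on gpu
// async tokens so the transfers are ordered without host-side blocking waits.
%t0 = gpu.wait async
%memref, %t1 = gpu.alloc async [%t0] () : memref<f32>
%t2 = gpu.memcpy async [%t1] %memref, %alloca : memref<f32>, memref<f32>
```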