diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -207,8 +207,11 @@ def LLVM_UMinOp : LLVM_BinarySameArgsIntrOpI<"umin">; class LLVM_MemcpyIntrOpBase : - LLVM_ZeroResultIntrOp { + LLVM_ZeroResultIntrOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods], + /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1> { dag args = (ins Arg:$dst, Arg:$src, AnySignlessInteger:$len, I1Attr:$isVolatile); @@ -245,9 +248,11 @@ def LLVM_MemmoveOp : LLVM_MemcpyIntrOpBase<"memmove">; def LLVM_MemcpyInlineOp : - LLVM_ZeroResultIntrOp<"memcpy.inline", [0, 1], [], - /*requiresAccessGroup=*/1, - /*requiresAliasAnalysis=*/1> { + LLVM_ZeroResultIntrOp<"memcpy.inline", [0, 1], + [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods], + /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1> { dag args = (ins Arg:$dst, Arg:$src, APIntAttr:$len, I1Attr:$isVolatile); diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/ValueRange.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" #include "mlir/Interfaces/MemorySlotInterfaces.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" using namespace mlir; @@ -414,35 +415,65 @@ } //===----------------------------------------------------------------------===// -// Interfaces for memset +// Utilities for memory intrinsics //===----------------------------------------------------------------------===// -/// Returns the length of the given memset in bytes if it can be known at -/// compile-time on a best-effort basis, nothing otherwise. -static std::optional getStaticMemsetLen(LLVM::MemsetOp op) { - IntegerAttr::ValueType memsetLen; - if (!matchPattern(op.getLen(), m_ConstantInt(&memsetLen))) - return {}; +namespace { - if (memsetLen.getBitWidth() > 64) +/// Returns the length of the given memory intrinsic in bytes if it can be known +/// at compile-time on a best-effort basis, nothing otherwise. +template +std::optional getStaticMemIntrLen(MemIntr op) { + APInt memIntrLen; + if (!matchPattern(op.getLen(), m_ConstantInt(&memIntrLen))) + return {}; + if (memIntrLen.getBitWidth() > 64) return {}; + return memIntrLen.getZExtValue(); +} - return memsetLen.getZExtValue(); +/// Returns the length of the given memory intrinsic in bytes if it can be known +/// at compile-time on a best-effort basis, nothing otherwise. +/// Because MemcpyInlineOp has its length encoded as an attribute, this requires +/// specialized handling. +template <> +std::optional getStaticMemIntrLen(LLVM::MemcpyInlineOp op) { + APInt memIntrLen = op.getLen(); + if (memIntrLen.getBitWidth() > 64) + return {}; + return memIntrLen.getZExtValue(); } -/// Returns whether one can be sure the memset does not write outside of the -/// bounds of the given slot, on a best-effort basis. -static bool definitelyWritesOnlyWithinSlot(LLVM::MemsetOp op, - const MemorySlot &slot, +} // namespace + +/// Returns whether one can be sure the memory intrinsic does not write outside +/// of the bounds of the given slot, on a best-effort basis. +template +static bool definitelyWritesOnlyWithinSlot(MemIntr op, const MemorySlot &slot, DataLayout &dataLayout) { if (!isa(slot.ptr.getType()) || op.getDst() != slot.ptr) return false; - std::optional memsetLen = getStaticMemsetLen(op); - return memsetLen && *memsetLen <= dataLayout.getTypeSize(slot.elemType); + std::optional memIntrLen = getStaticMemIntrLen(op); + return memIntrLen && *memIntrLen <= dataLayout.getTypeSize(slot.elemType); +} + +/// Checks whether all indices are i32. This is used to check GEPs can index +/// into them. +static bool areAllIndicesI32(const DestructurableMemorySlot &slot) { + Type i32 = IntegerType::get(slot.ptr.getContext(), 32); + return llvm::all_of(llvm::make_first_range(slot.elementPtrs), + [&](Attribute index) { + auto intIndex = dyn_cast(index); + return intIndex && intIndex.getType() == i32; + }); } +//===----------------------------------------------------------------------===// +// Interfaces for memset +//===----------------------------------------------------------------------===// + bool LLVM::MemsetOp::loadsFrom(const MemorySlot &slot) { return false; } bool LLVM::MemsetOp::storesTo(const MemorySlot &slot) { @@ -459,8 +490,8 @@ assert(intType.getWidth() % 8 == 0); - // Build the memset integer by repeatedly shifting the value and or-ing - // it with the previous value. + // Build the memset integer by repeatedly shifting the value and + // or-ing it with the previous value. uint64_t coveredBits = 8; Value currentValue = rewriter.create(getLoc(), intType, getVal()); @@ -499,7 +530,7 @@ return false; DataLayout layout = DataLayout::closest(*this); - return getStaticMemsetLen(*this) == layout.getTypeSize(slot.elemType); + return getStaticMemIntrLen(*this) == layout.getTypeSize(slot.elemType); } DeletionKind LLVM::MemsetOp::removeBlockingUses( @@ -527,6 +558,9 @@ .getSubelementIndexMap()) return false; + if (!areAllIndicesI32(slot)) + return false; + DataLayout dataLayout = DataLayout::closest(*this); return definitelyWritesOnlyWithinSlot(*this, slot, dataLayout); } @@ -587,6 +621,302 @@ return DeletionKind::Delete; } +//===----------------------------------------------------------------------===// +// Interfaces for memcpy/memmove +//===----------------------------------------------------------------------===// + +template +static bool memcpyLoadsFrom(MemcpyLike op, const MemorySlot &slot) { + return op.getSrc() == slot.ptr; +} + +template +static bool memcpyStoresTo(MemcpyLike op, const MemorySlot &slot) { + return op.getDst() == slot.ptr; +} + +template +static Value memcpyGetStored(MemcpyLike op, const MemorySlot &slot, + RewriterBase &rewriter) { + return rewriter.create(op.getLoc(), slot.elemType, op.getSrc()); +} + +template +static bool +memcpyCanUsesBeRemoved(MemcpyLike op, const MemorySlot &slot, + const SmallPtrSetImpl &blockingUses, + SmallVectorImpl &newBlockingUses) { + // If source and destination are the same, memcpy behavior is undefined and + // memmove is a no-op. Because there is no memory change happening here, + // simplifying such operations is left to canonicalization. + if (op.getDst() == op.getSrc()) + return false; + + if (op.getIsVolatile()) + return false; + + DataLayout layout = DataLayout::closest(op); + return getStaticMemIntrLen(op) == layout.getTypeSize(slot.elemType); +} + +template +static DeletionKind +memcpyRemoveBlockingUses(MemcpyLike op, const MemorySlot &slot, + const SmallPtrSetImpl &blockingUses, + RewriterBase &rewriter, Value reachingDefinition) { + if (op.loadsFrom(slot)) + rewriter.create(op.getLoc(), reachingDefinition, + op.getDst()); + return DeletionKind::Delete; +} + +template +static LogicalResult +memcpyEnsureOnlySafeAccesses(MemcpyLike op, const MemorySlot &slot, + SmallVectorImpl &mustBeSafelyUsed) { + DataLayout dataLayout = DataLayout::closest(op); + // While rewiring memcpy-like intrinsics only supports full copies, partial + // copies are still safe accesses so it is enough to only check for writes + // within bounds. + return success(definitelyWritesOnlyWithinSlot(op, slot, dataLayout)); +} + +template +static bool memcpyCanRewire(MemcpyLike op, const DestructurableMemorySlot &slot, + SmallPtrSetImpl &usedIndices, + SmallVectorImpl &mustBeSafelyUsed) { + if (op.getIsVolatile()) + return false; + + if (!slot.elemType.cast() + .getSubelementIndexMap()) + return false; + + if (!areAllIndicesI32(slot)) + return false; + + // Only full copies are supported. + DataLayout dataLayout = DataLayout::closest(op); + if (getStaticMemIntrLen(op) != dataLayout.getTypeSize(slot.elemType)) + return false; + + if (op.getSrc() == slot.ptr) + for (Attribute index : llvm::make_first_range(slot.elementPtrs)) + usedIndices.insert(index); + + return true; +} + +namespace { + +template +void createMemcpyLikeToReplace(RewriterBase &rewriter, const DataLayout &layout, + MemcpyLike toReplace, Value dst, Value src, + Type toCpy, bool isVolatile) { + Value memcpySize = rewriter.create( + toReplace.getLoc(), IntegerAttr::get(toReplace.getLen().getType(), + layout.getTypeSize(toCpy))); + rewriter.create(toReplace.getLoc(), dst, src, memcpySize, + isVolatile); +} + +template <> +void createMemcpyLikeToReplace(RewriterBase &rewriter, const DataLayout &layout, + LLVM::MemcpyInlineOp toReplace, Value dst, + Value src, Type toCpy, bool isVolatile) { + Type lenType = IntegerType::get(toReplace->getContext(), + toReplace.getLen().getBitWidth()); + rewriter.create( + toReplace.getLoc(), dst, src, + IntegerAttr::get(lenType, layout.getTypeSize(toCpy)), isVolatile); +} + +} // namespace + +/// Rewires a memcpy-like operation. Only copies to or from the full slot are +/// supported. +template +static DeletionKind memcpyRewire(MemcpyLike op, + const DestructurableMemorySlot &slot, + DenseMap &subslots, + RewriterBase &rewriter) { + if (subslots.empty()) + return DeletionKind::Delete; + + DataLayout layout = DataLayout::closest(op); + + assert((slot.ptr == op.getDst()) != (slot.ptr == op.getSrc())); + bool isDst = slot.ptr == op.getDst(); + +#ifndef NDEBUG + size_t slotsTreated = 0; +#endif + + // It was previously checked that index types are consistent, so this type can + // be fetched now. + Type indexType = cast(subslots.begin()->first).getType(); + for (size_t i = 0, e = slot.elementPtrs.size(); i != e; i++) { + Attribute index = IntegerAttr::get(indexType, i); + if (!subslots.contains(index)) + continue; + const MemorySlot &subslot = subslots.at(index); + +#ifndef NDEBUG + slotsTreated++; +#endif + + // First get a pointer to the equivalent of this subslot from the source + // pointer. + SmallVector gepIndices{ + 0, static_cast( + cast(index).getValue().getZExtValue())}; + Value subslotPtrInOther = rewriter.create( + op.getLoc(), LLVM::LLVMPointerType::get(op.getContext()), slot.elemType, + isDst ? op.getSrc() : op.getDst(), gepIndices); + + // Then create a new memcpy out of this source pointer. + createMemcpyLikeToReplace(rewriter, layout, op, + isDst ? subslot.ptr : subslotPtrInOther, + isDst ? subslotPtrInOther : subslot.ptr, + subslot.elemType, op.getIsVolatile()); + } + + assert(subslots.size() == slotsTreated); + + return DeletionKind::Delete; +} + +bool LLVM::MemcpyOp::loadsFrom(const MemorySlot &slot) { + return memcpyLoadsFrom(*this, slot); +} + +bool LLVM::MemcpyOp::storesTo(const MemorySlot &slot) { + return memcpyStoresTo(*this, slot); +} + +Value LLVM::MemcpyOp::getStored(const MemorySlot &slot, + RewriterBase &rewriter) { + return memcpyGetStored(*this, slot, rewriter); +} + +bool LLVM::MemcpyOp::canUsesBeRemoved( + const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, + SmallVectorImpl &newBlockingUses) { + return memcpyCanUsesBeRemoved(*this, slot, blockingUses, newBlockingUses); +} + +DeletionKind LLVM::MemcpyOp::removeBlockingUses( + const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, + RewriterBase &rewriter, Value reachingDefinition) { + return memcpyRemoveBlockingUses(*this, slot, blockingUses, rewriter, + reachingDefinition); +} + +LogicalResult LLVM::MemcpyOp::ensureOnlySafeAccesses( + const MemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { + return memcpyEnsureOnlySafeAccesses(*this, slot, mustBeSafelyUsed); +} + +bool LLVM::MemcpyOp::canRewire(const DestructurableMemorySlot &slot, + SmallPtrSetImpl &usedIndices, + SmallVectorImpl &mustBeSafelyUsed) { + return memcpyCanRewire(*this, slot, usedIndices, mustBeSafelyUsed); +} + +DeletionKind LLVM::MemcpyOp::rewire(const DestructurableMemorySlot &slot, + DenseMap &subslots, + RewriterBase &rewriter) { + return memcpyRewire(*this, slot, subslots, rewriter); +} + +bool LLVM::MemcpyInlineOp::loadsFrom(const MemorySlot &slot) { + return memcpyLoadsFrom(*this, slot); +} + +bool LLVM::MemcpyInlineOp::storesTo(const MemorySlot &slot) { + return memcpyStoresTo(*this, slot); +} + +Value LLVM::MemcpyInlineOp::getStored(const MemorySlot &slot, + RewriterBase &rewriter) { + return memcpyGetStored(*this, slot, rewriter); +} + +bool LLVM::MemcpyInlineOp::canUsesBeRemoved( + const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, + SmallVectorImpl &newBlockingUses) { + return memcpyCanUsesBeRemoved(*this, slot, blockingUses, newBlockingUses); +} + +DeletionKind LLVM::MemcpyInlineOp::removeBlockingUses( + const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, + RewriterBase &rewriter, Value reachingDefinition) { + return memcpyRemoveBlockingUses(*this, slot, blockingUses, rewriter, + reachingDefinition); +} + +LogicalResult LLVM::MemcpyInlineOp::ensureOnlySafeAccesses( + const MemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { + return memcpyEnsureOnlySafeAccesses(*this, slot, mustBeSafelyUsed); +} + +bool LLVM::MemcpyInlineOp::canRewire( + const DestructurableMemorySlot &slot, + SmallPtrSetImpl &usedIndices, + SmallVectorImpl &mustBeSafelyUsed) { + return memcpyCanRewire(*this, slot, usedIndices, mustBeSafelyUsed); +} + +DeletionKind +LLVM::MemcpyInlineOp::rewire(const DestructurableMemorySlot &slot, + DenseMap &subslots, + RewriterBase &rewriter) { + return memcpyRewire(*this, slot, subslots, rewriter); +} + +bool LLVM::MemmoveOp::loadsFrom(const MemorySlot &slot) { + return memcpyLoadsFrom(*this, slot); +} + +bool LLVM::MemmoveOp::storesTo(const MemorySlot &slot) { + return memcpyStoresTo(*this, slot); +} + +Value LLVM::MemmoveOp::getStored(const MemorySlot &slot, + RewriterBase &rewriter) { + return memcpyGetStored(*this, slot, rewriter); +} + +bool LLVM::MemmoveOp::canUsesBeRemoved( + const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, + SmallVectorImpl &newBlockingUses) { + return memcpyCanUsesBeRemoved(*this, slot, blockingUses, newBlockingUses); +} + +DeletionKind LLVM::MemmoveOp::removeBlockingUses( + const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, + RewriterBase &rewriter, Value reachingDefinition) { + return memcpyRemoveBlockingUses(*this, slot, blockingUses, rewriter, + reachingDefinition); +} + +LogicalResult LLVM::MemmoveOp::ensureOnlySafeAccesses( + const MemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { + return memcpyEnsureOnlySafeAccesses(*this, slot, mustBeSafelyUsed); +} + +bool LLVM::MemmoveOp::canRewire(const DestructurableMemorySlot &slot, + SmallPtrSetImpl &usedIndices, + SmallVectorImpl &mustBeSafelyUsed) { + return memcpyCanRewire(*this, slot, usedIndices, mustBeSafelyUsed); +} + +DeletionKind LLVM::MemmoveOp::rewire(const DestructurableMemorySlot &slot, + DenseMap &subslots, + RewriterBase &rewriter) { + return memcpyRewire(*this, slot, subslots, rewriter); +} + //===----------------------------------------------------------------------===// // Interfaces for destructurable types //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir b/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir --- a/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir +++ b/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir @@ -143,3 +143,157 @@ %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i10 llvm.return %2 : i10 } + +// ----- + +// CHECK-LABEL: llvm.func @basic_memcpy +// CHECK-SAME: (%[[SOURCE:.*]]: !llvm.ptr) +llvm.func @basic_memcpy(%source: !llvm.ptr) -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %is_volatile = llvm.mlir.constant(false) : i1 + %memcpy_len = llvm.mlir.constant(4 : i32) : i32 + "llvm.intr.memcpy"(%1, %source, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + // CHECK-NOT: "llvm.intr.memcpy" + // CHECK: %[[LOADED:.*]] = llvm.load %[[SOURCE]] : !llvm.ptr -> i32 + // CHECK-NOT: "llvm.intr.memcpy" + %2 = llvm.load %1 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[LOADED]] : i32 + llvm.return %2 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @basic_memcpy_dest +// CHECK-SAME: (%[[DESTINATION:.*]]: !llvm.ptr) +llvm.func @basic_memcpy_dest(%destination: !llvm.ptr) -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[DATA:.*]] = llvm.mlir.constant(42 : i32) : i32 + %data = llvm.mlir.constant(42 : i32) : i32 + %is_volatile = llvm.mlir.constant(false) : i1 + %memcpy_len = llvm.mlir.constant(4 : i32) : i32 + + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + llvm.store %data, %1 : i32, !llvm.ptr + "llvm.intr.memcpy"(%destination, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + // CHECK-NOT: "llvm.intr.memcpy" + // CHECK: llvm.store %[[DATA]], %[[DESTINATION]] : i32, !llvm.ptr + // CHECK-NOT: "llvm.intr.memcpy" + + %2 = llvm.load %1 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[DATA]] : i32 + llvm.return %2 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @double_memcpy +llvm.func @double_memcpy() -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NEXT: %[[DATA:.*]] = llvm.mlir.constant(42 : i32) : i32 + %data = llvm.mlir.constant(42 : i32) : i32 + %is_volatile = llvm.mlir.constant(false) : i1 + %memcpy_len = llvm.mlir.constant(4 : i32) : i32 + + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %2 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + llvm.store %data, %1 : i32, !llvm.ptr + "llvm.intr.memcpy"(%2, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + + %res = llvm.load %2 : !llvm.ptr -> i32 + // CHECK-NEXT: llvm.return %[[DATA]] : i32 + llvm.return %res : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @ignore_self_memcpy +llvm.func @ignore_self_memcpy() -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %is_volatile = llvm.mlir.constant(false) : i1 + %memcpy_len = llvm.mlir.constant(4 : i32) : i32 + + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[ALLOCA]] + "llvm.intr.memcpy"(%1, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + + %res = llvm.load %1 : !llvm.ptr -> i32 + llvm.return %res : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @ignore_partial_memcpy +// CHECK-SAME: (%[[SOURCE:.*]]: !llvm.ptr) +llvm.func @ignore_partial_memcpy(%source: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %is_volatile = llvm.mlir.constant(false) : i1 + // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(2 : i32) : i32 + %memcpy_len = llvm.mlir.constant(2 : i32) : i32 + + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SOURCE]], %[[MEMCPY_LEN]]) <{isVolatile = false}> + "llvm.intr.memcpy"(%1, %source, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + + %res = llvm.load %1 : !llvm.ptr -> i32 + llvm.return %res : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @ignore_volatile_memcpy +// CHECK-SAME: (%[[SOURCE:.*]]: !llvm.ptr) +llvm.func @ignore_volatile_memcpy(%source: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %is_volatile = llvm.mlir.constant(false) : i1 + %memcpy_len = llvm.mlir.constant(4 : i32) : i32 + + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SOURCE]], %[[MEMCPY_LEN]]) <{isVolatile = true}> + "llvm.intr.memcpy"(%1, %source, %memcpy_len) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () + + %res = llvm.load %1 : !llvm.ptr -> i32 + llvm.return %res : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @basic_memmove +// CHECK-SAME: (%[[SOURCE:.*]]: !llvm.ptr) +llvm.func @basic_memmove(%source: !llvm.ptr) -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %is_volatile = llvm.mlir.constant(false) : i1 + %memmove_len = llvm.mlir.constant(4 : i32) : i32 + "llvm.intr.memmove"(%1, %source, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + // CHECK-NOT: "llvm.intr.memmove" + // CHECK: %[[LOADED:.*]] = llvm.load %[[SOURCE]] : !llvm.ptr -> i32 + // CHECK-NOT: "llvm.intr.memmove" + %2 = llvm.load %1 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[LOADED]] : i32 + llvm.return %2 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @basic_memcpy_inline +// CHECK-SAME: (%[[SOURCE:.*]]: !llvm.ptr) +llvm.func @basic_memcpy_inline(%source: !llvm.ptr) -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %is_volatile = llvm.mlir.constant(false) : i1 + "llvm.intr.memcpy.inline"(%1, %source) <{isVolatile = false, len = 4 : i32}> : (!llvm.ptr, !llvm.ptr) -> () + // CHECK-NOT: "llvm.intr.memcpy.inline" + // CHECK: %[[LOADED:.*]] = llvm.load %[[SOURCE]] : !llvm.ptr -> i32 + // CHECK-NOT: "llvm.intr.memcpy.inline" + %2 = llvm.load %1 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[LOADED]] : i32 + llvm.return %2 : i32 +} diff --git a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir --- a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir +++ b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir @@ -235,3 +235,202 @@ %7 = llvm.add %3, %6 : i32 llvm.return %7 : i32 } + +// ----- + +// CHECK-LABEL: llvm.func @memcpy_dest +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32 + // After SROA, only one i32 will be actually used, so only 4 bytes will be set. + // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr + %memcpy_len = llvm.mlir.constant(40 : i32) : i32 + // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMCPY_LEN]]) <{isVolatile = false}> + "llvm.intr.memcpy"(%1, %other_array, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memcpy_src +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // After SROA, only one i32 will be actually used, so only 4 bytes will be set. + // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr + %memcpy_len = llvm.mlir.constant(16 : i32) : i32 + // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from. + // We can only check that the amount of operations and allocated slots is correct, which should be sufficient + // as unused slots are not generated. + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}> + "llvm.intr.memcpy"(%other_array, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memcpy_double +llvm.func @memcpy_double() -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-COUNT-2: = llvm.alloca %[[ALLOCA_LEN]] x i32 + %1 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr + %2 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr + %memcpy_len = llvm.mlir.constant(4 : i32) : i32 + // CHECK-NOT: "llvm.intr.memcpy" + // CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}> + // CHECK-NOT: "llvm.intr.memcpy" + "llvm.intr.memcpy"(%1, %2, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %3 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<1 x i32> + %4 = llvm.load %3 : !llvm.ptr -> i32 + llvm.return %4 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memcpy_no_partial +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memcpy_no_partial(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x !llvm.array<10 x i32> + // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(21 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr + %memcpy_len = llvm.mlir.constant(21 : i32) : i32 + // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[OTHER_ARRAY]], %[[MEMCPY_LEN]]) <{isVolatile = false}> + "llvm.intr.memcpy"(%1, %other_array, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memcpy_no_volatile +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memcpy_no_volatile(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x !llvm.array<10 x i32> + // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(40 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr + %memcpy_len = llvm.mlir.constant(40 : i32) : i32 + // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[OTHER_ARRAY]], %[[MEMCPY_LEN]]) <{isVolatile = true}> + "llvm.intr.memcpy"(%1, %other_array, %memcpy_len) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memmove_dest +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32 + // After SROA, only one i32 will be actually used, so only 4 bytes will be set. + // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr + %memmove_len = llvm.mlir.constant(40 : i32) : i32 + // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + // CHECK: "llvm.intr.memmove"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMMOVE_LEN]]) <{isVolatile = false}> + "llvm.intr.memmove"(%1, %other_array, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memmove_src +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // After SROA, only one i32 will be actually used, so only 4 bytes will be set. + // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr + %memmove_len = llvm.mlir.constant(16 : i32) : i32 + // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from. + // We can only check that the amount of operations and allocated slots is correct, which should be sufficient + // as unused slots are not generated. + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}> + "llvm.intr.memmove"(%other_array, %1, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memcpy_inline_dest +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memcpy_inline_dest(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32 + // After SROA, only one i32 will be actually used, so only 4 bytes will be set. + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr + // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + // CHECK: "llvm.intr.memcpy.inline"(%[[ALLOCA]], %[[SLOT_IN_OTHER]]) <{isVolatile = false, len = 4 : i32}> + "llvm.intr.memcpy.inline"(%1, %other_array) <{isVolatile = false, len = 40 : i32}> : (!llvm.ptr, !llvm.ptr) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @memcpy_inline_src +// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr) +llvm.func @memcpy_inline_src(%other_array: !llvm.ptr) -> i32 { + // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32 + // After SROA, only one i32 will be actually used, so only 4 bytes will be set. + // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr + // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from. + // We can only check that the amount of operations and allocated slots is correct, which should be sufficient + // as unused slots are not generated. + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy.inline"(%[[SLOT_IN_OTHER]], %{{.*}}) <{isVolatile = false, len = 4 : i32}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy.inline"(%[[SLOT_IN_OTHER]], %{{.*}}) <{isVolatile = false, len = 4 : i32}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy.inline"(%[[SLOT_IN_OTHER]], %{{.*}}) <{isVolatile = false, len = 4 : i32}> + // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + // CHECK-DAG: "llvm.intr.memcpy.inline"(%[[SLOT_IN_OTHER]], %{{.*}}) <{isVolatile = false, len = 4 : i32}> + "llvm.intr.memcpy.inline"(%other_array, %1) <{isVolatile = false, len = 16 : i32}> : (!llvm.ptr, !llvm.ptr) -> () + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +}