diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -29,6 +29,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -132,6 +133,11 @@
   Function &Func;
   const DataLayout &DL;
   const TargetTransformInfo &TTI;
+  // Cached !alias.scope / !noalias lists per (base pointer, stride) pair,
+  // indexed by column number. NOTE(review): template arguments were lost in
+  // transit; reconstructed from usage below — confirm against the original.
+  DenseMap<std::pair<Value *, Value *>, SmallVector<MDNode *, 4>> AliasScopes;
+  DenseMap<std::pair<Value *, Value *>, SmallVector<MDNode *, 4>> NoaliasLists;
+  MDNode *ScopeDomain = nullptr;
 
   /// Wrapper class representing a matrix as a set of column vectors.
   /// All column vectors must have the same vector type.
@@ -455,6 +459,14 @@
   }
 
   bool Visit() {
+    // Create a self-referential (and therefore distinct) domain MDNode; every
+    // column alias scope created while lowering this function belongs to it.
+    TempMDTuple TmpScopeDomain = MDNode::getTemporary(Func.getContext(), None);
+    SmallVector<Metadata *, 1> MDs;
+    MDs.push_back(TmpScopeDomain.get());
+    ScopeDomain = MDNode::get(Func.getContext(), MDs);
+    ScopeDomain->replaceOperandWith(0, ScopeDomain);
+
     if (EnableShapePropagation) {
       SmallVector<Instruction *, 32> WorkList;
@@ -554,20 +565,72 @@
     return true;
   }
 
+  /// Return the !alias.scope list for column \p Col of the columnwise access
+  /// with base pointer \p Ptr and stride \p Stride, creating and caching it
+  /// on first use. Columns must be requested in ascending order.
+  MDNode *getOrCreateScopeList(Value *Ptr, Value *Stride, unsigned Col,
+                               LLVMContext &Context) {
+    auto I = AliasScopes.find({Ptr, Stride});
+    if (I != AliasScopes.end() && Col < I->second.size())
+      return I->second[Col];
+
+    // Create a new scope in ScopeDomain (self-referential => distinct).
+    TempMDTuple TmpScope = MDNode::getTemporary(Context, None);
+    MDNode *Scope = MDNode::get(Context, {TmpScope.get(), ScopeDomain});
+    Scope->replaceOperandWith(0, Scope);
+
+    // Create alias.scope list.
+    MDNode *AliasScopeList = MDNode::get(Context, {Scope});
+
+    // Cache and return alias.scope list. The cache is filled strictly in
+    // column order.
+    auto II = AliasScopes.insert({{Ptr, Stride}, {}});
+    assert(II.first->second.size() == Col && "columns created out of order");
+    II.first->second.push_back(AliasScopeList);
+    return AliasScopeList;
+  }
+
   void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride,
                  ShapeInfo Shape) {
     IRBuilder<> Builder(Inst);
     auto VType = cast<VectorType>(Inst->getType());
     Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+
     ColumnMatrixTy Result;
+    SmallVector<LoadInst *, 8> Loads;
+    SmallVector<MDNode *, 8> Scopes;
     // Distance between start of one column and the start of the next
     for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) {
       Value *GEP =
           computeColumnAddr(EltPtr, Builder.getInt32(C), Stride, Shape.NumRows,
                             VType->getElementType(), Builder);
-      Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder);
+      LoadInst *Column =
+          createColumnLoad(GEP, VType->getElementType(), Builder);
+      Loads.push_back(Column);
       Result.addColumn(Column);
+      Scopes.push_back(getOrCreateScopeList(Ptr, Stride, C, Ptr->getContext()));
     }
+
+    // Annotate each column load: it is in its own scope and does not alias
+    // any other column of the same (pointer, stride) access.
+    for (unsigned i = 0; i < Loads.size(); i++) {
+      SmallVector<Metadata *, 4> Copy;
+      Loads[i]->setMetadata(LLVMContext::MD_alias_scope, Scopes[i]);
+
+      MDNode *NoAliasScopeList;
+      auto NA = NoaliasLists.find({Ptr, Stride});
+      if (NA != NoaliasLists.end() && i < NA->second.size())
+        NoAliasScopeList = NA->second[i];
+      else {
+        for (unsigned j = 0; j < Loads.size(); j++)
+          if (j != i)
+            Copy.push_back(Scopes[j]->getOperand(0));
+        NoAliasScopeList = MDNode::get(Ptr->getContext(), Copy);
+        auto I = NoaliasLists.insert({{Ptr, Stride}, {}});
+        I.first->second.push_back(NoAliasScopeList);
+      }
+      Loads[i]->setMetadata(LLVMContext::MD_noalias, NoAliasScopeList);
+    }
     finalizeLowering(Inst, Result, Builder);
   }
@@ -586,11 +641,37 @@
     auto VType = cast<VectorType>(Matrix->getType());
     Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
     auto LM = getMatrix(Matrix, Shape, Builder);
+    SmallVector<MDNode *, 8> Scopes;
+    SmallVector<StoreInst *, 8> Stores;
     for (auto C : enumerate(LM.columns())) {
       Value *GEP =
           computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride,
                             Shape.NumRows, VType->getElementType(), Builder);
-      createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
+      Stores.push_back(
+          createColumnStore(C.value(), GEP, VType->getElementType(), Builder));
+      Scopes.push_back(
+          getOrCreateScopeList(Ptr, Stride, C.index(), Ptr->getContext()));
+    }
+
+    // Annotate each column store like the loads above; the scope lists are
+    // shared with loads through the (pointer, stride) caches.
+    for (unsigned i = 0; i < Stores.size(); i++) {
+      SmallVector<Metadata *, 4> Copy;
+      Stores[i]->setMetadata(LLVMContext::MD_alias_scope, Scopes[i]);
+
+      MDNode *NoAliasScopeList;
+      auto NA = NoaliasLists.find({Ptr, Stride});
+      if (NA != NoaliasLists.end() && i < NA->second.size()) {
+        NoAliasScopeList = NA->second[i];
+      } else {
+        for (unsigned j = 0; j < Stores.size(); j++)
+          if (j != i)
+            Copy.push_back(Scopes[j]->getOperand(0));
+        NoAliasScopeList = MDNode::get(Ptr->getContext(), Copy);
+        auto I = NoaliasLists.insert({{Ptr, Stride}, {}});
+        I.first->second.push_back(NoAliasScopeList);
+      }
+      Stores[i]->setMetadata(LLVMContext::MD_noalias, NoAliasScopeList);
     }
 
     ToRemove.push_back(Inst);
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-store-alias-metadata.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-store-alias-metadata.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-store-alias-metadata.ll
@@ -0,0 +1,92 @@
+; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+; The columnwise loads and stores from %in1 share stride and matrix size.
+; For each column pointer, we create a separate alias scope. Each access
+; does not alias accesses to the same pointer & stride, except their own
+; column.
+define void @load_store_shared_stride(double* %in1, double* %in2, i32 %stride) {
+; CHECK-LABEL: @load_store_shared_stride(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[IN1:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8, !alias.scope [[IN1_C0:!.*]], !noalias [[IN1_C1:!.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 1, [[STRIDE]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, double* [[IN1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8, !alias.scope [[IN1_C1]], !noalias [[IN1_C0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 0, [[STRIDE]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr double, double* [[IN2:%.*]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x double>, <2 x double>* [[TMP11]], align 8, !alias.scope [[IN2_C0:!.*]], !noalias [[IN2_C1:!.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 1, [[STRIDE]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr double, double* [[IN2]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
+; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8, !alias.scope [[IN2_C1]], !noalias [[IN2_C0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x double> [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x double> [[TMP8]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 0, [[STRIDE]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr double, double* [[IN1]], i32 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast double* [[TMP20]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[TMP21]], align 8, !alias.scope [[IN1_C0]], !noalias [[IN1_C1]]
+; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 1, [[STRIDE]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr double, double* [[IN1]], i32 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast double* [[TMP23]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP18]], <2 x double>* [[TMP24]], align 8, !alias.scope [[IN1_C1]], !noalias [[IN1_C0]]
+; CHECK-NEXT:    ret void
+;
+  %v1 = call <4 x double> @llvm.matrix.columnwise.load(double* %in1, i32 %stride, i32 2, i32 2)
+  %v2 = call <4 x double> @llvm.matrix.columnwise.load(double* %in2, i32 %stride, i32 2, i32 2)
+  %res = fadd <4 x double> %v1, %v2
+  call void @llvm.matrix.columnwise.store(<4 x double> %res, double* %in1, i32 %stride, i32 2, i32 2)
+  ret void
+}
+
+define void @load_store_different_strides(double* %in1, double* %in2, i32 %stride, i32 %stride2) {
+; CHECK-LABEL: @load_store_different_strides(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[IN1:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8, !alias.scope [[F2_IN1_C0:!.*]], !noalias [[F2_IN1_C1:!.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 1, [[STRIDE]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, double* [[IN1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8, !alias.scope [[F2_IN1_C1]], !noalias [[F2_IN1_C0]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 0, [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr double, double* [[IN1]], i32 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast double* [[TMP20]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP21]], align 8, !alias.scope [[F2_IN1_C0_STRIDE2:!.*]], !noalias [[F2_IN1_C1_STRIDE2:!.*]]
+; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 1, [[STRIDE2]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr double, double* [[IN1]], i32 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast double* [[TMP23]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP24]], align 8, !alias.scope [[F2_IN1_C1_STRIDE2]], !noalias [[F2_IN1_C0_STRIDE2]]
+; CHECK-NEXT:    ret void
+;
+  %v1 = call <4 x double> @llvm.matrix.columnwise.load(double* %in1, i32 %stride, i32 2, i32 2)
+  call void @llvm.matrix.columnwise.store(<4 x double> %v1, double* %in1, i32 %stride2, i32 2, i32 2)
+  ret void
+}
+
+declare <4 x double> @llvm.matrix.columnwise.load(double*, i32, i32, i32)
+declare void @llvm.matrix.columnwise.store(<4 x double>, double*, i32, i32, i32)
+
+
+; CHECK: [[IN1_C0]] = !{!1}
+; CHECK-NEXT: !1 = distinct !{!1, !2}
+; CHECK-NEXT: !2 = distinct !{!2}
+; CHECK-NEXT: [[IN1_C1]] = !{!4}
+; CHECK-NEXT: !4 = distinct !{!4, !2}
+; CHECK-NEXT: [[IN2_C0]] = !{!6}
+; CHECK-NEXT: !6 = distinct !{!6, !2}
+; CHECK-NEXT: [[IN2_C1]] = !{!8}
+; CHECK-NEXT: !8 = distinct !{!8, !2}
+
+; CHECK-NEXT: [[F2_IN1_C0]] = !{!10}
+; CHECK-NEXT: !10 = distinct !{!10, !11}
+; CHECK-NEXT: !11 = distinct !{!11}
+; CHECK-NEXT: [[F2_IN1_C1]] = !{!13}
+; CHECK-NEXT: !13 = distinct !{!13, !11}
+; CHECK-NEXT: [[F2_IN1_C0_STRIDE2]] = !{!15}
+; CHECK-NEXT: !15 = distinct !{!15, !11}
+; CHECK-NEXT: [[F2_IN1_C1_STRIDE2]] = !{!17}
+; CHECK-NEXT: !17 = distinct !{!17, !11}