diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -17,7 +17,10 @@
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/OrderedInstructions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +36,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -44,6 +48,15 @@
     cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
              "instructions."));
 
+static cl::opt<bool>
+    FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
+               cl::desc("Enable/disable fusing matrix instructions."));
+static cl::opt<unsigned>
+    TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden,
+             cl::desc("Tile size for matrix instruction fusion."));
+static cl::opt<bool> ForceFusion(
+    "force-fuse-matrix", cl::init(false), cl::Hidden,
+    cl::desc("Force matrix instruction fusion even if not profitable."));
 static cl::opt<bool> AllowContractEnabled(
     "matrix-allow-contract", cl::init(false), cl::Hidden,
     cl::desc("Allow the use of FMAs if available and profitable. This may "
@@ -146,6 +159,10 @@
   Function &Func;
   const DataLayout &DL;
   const TargetTransformInfo &TTI;
+  AliasAnalysis &AA;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  OrderedInstructions OI;
   OptimizationRemarkEmitter &ORE;
 
   /// Contains estimates of the number of operations (loads, stores, compute)
   /// required to lower a matrix operation.
@@ -299,8 +316,10 @@
 
 public:
   LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
+                        AliasAnalysis &AA, DominatorTree &DT, LoopInfo &LI,
                         OptimizationRemarkEmitter &ORE)
-      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), ORE(ORE) {}
+      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
+        LI(LI), OI(&DT), ORE(ORE) {}
 
   unsigned getNumOps(Type *VT) {
     assert(isa<VectorType>(VT) && "Expected vector type");
@@ -586,24 +605,46 @@
       }
     }
 
-    ReversePostOrderTraversal<Function *> RPOT(&Func);
     bool Changed = false;
-    for (auto *BB : RPOT) {
-      for (Instruction &Inst : make_early_inc_range(*BB)) {
-        IRBuilder<> Builder(&Inst);
-
-        if (CallInst *CInst = dyn_cast<CallInst>(&Inst))
-          Changed |= VisitCallInst(CInst);
-
-        Value *Op1;
-        Value *Op2;
-        if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst))
-          Changed |= VisitBinaryOperator(BinOp);
-        if (match(&Inst, m_Load(m_Value(Op1))))
-          Changed |= VisitLoad(&Inst, Op1, Builder);
-        else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2))))
-          Changed |= VisitStore(&Inst, Op1, Op2, Builder);
+    SmallVector<CallInst *, 16> MaybeFusableInsts;
+    SmallVector<Instruction *, 16> MatrixInsts;
+
+    // First, collect all instructions with shape information and candidates
+    // for fusion (currently only matrix multiplies).
+    ReversePostOrderTraversal<Function *> RPOT(&Func);
+    for (auto *BB : RPOT)
+      for (Instruction &I : *BB) {
+        if (ShapeMap.find(&I) == ShapeMap.end())
+          continue;
+        if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
+          MaybeFusableInsts.push_back(cast<CallInst>(&I));
+        MatrixInsts.push_back(&I);
      }
+
+    // Second, try to fuse candidates.
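+    // (Editorial note, not part of the original patch.) The chains targeted
+    // by this step look like the multiply-fused.ll test below:
+    //   %a = load <16 x double>, <16 x double>* %A
+    //   %b = load <16 x double>, <16 x double>* %B
+    //   %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
+    //   store <16 x double> %c, <16 x double>* %C
+    // i.e. loads feeding a multiply whose only user is a store (see
+    // LowerMatrixMultiplyFused); everything else falls through to the
+    // per-instruction lowering in the third step.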
+    SmallPtrSet<Instruction *, 16> FusedInsts;
+    for (CallInst *CI : MaybeFusableInsts)
+      LowerMatrixMultiplyFused(CI, FusedInsts);
+    Changed = !FusedInsts.empty();
+
+    // Third, lower remaining instructions with shape information.
+    for (Instruction *Inst : MatrixInsts) {
+      if (FusedInsts.find(Inst) != FusedInsts.end())
+        continue;
+
+      IRBuilder<> Builder(Inst);
+
+      if (CallInst *CInst = dyn_cast<CallInst>(Inst))
+        Changed |= VisitCallInst(CInst);
+
+      Value *Op1;
+      Value *Op2;
+      if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
+        Changed |= VisitBinaryOperator(BinOp);
+      if (match(Inst, m_Load(m_Value(Op1))))
+        Changed |= VisitLoad(Inst, Op1, Builder);
+      else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
+        Changed |= VisitStore(Inst, Op1, Op2, Builder);
     }
 
     RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
@@ -699,7 +740,7 @@
     Value *TilePtr =
         Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
 
-    return loadMatrix(TileTy, TilePtr, Builder.getInt32(ResultShape.NumRows),
+    return loadMatrix(TileTy, TilePtr, Builder.getInt32(MatrixShape.NumRows),
                       ResultShape, Builder);
   }
 
@@ -743,7 +784,7 @@
         Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
     storeMatrix(TileTy, StoreVal, TilePtr,
-                Builder.getInt32(StoreVal.getNumRows()), Builder);
+                Builder.getInt32(MatrixShape.NumRows), Builder);
   }
 
   /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
@@ -915,6 +956,213 @@
     }
   }
 
+  /// Ensure that the memory in \p Load does not alias \p Store by potentially
+  /// copying it to a new location. The new location, or the original one if no
+  /// copy is needed, is returned.
+  Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
+                               CallInst *MatMul) {
+    MemoryLocation StoreLoc = MemoryLocation::get(Store);
+    MemoryLocation LoadLoc = MemoryLocation::get(Load);
+
+    AliasResult LdAliased = AA.alias(LoadLoc, StoreLoc);
+
+    // If we can statically determine noalias we're good.
+    if (!LdAliased)
+      return Load->getPointerOperand();
+
+    // Create code to check if the memory locations of the Load and Store
+    // overlap and if they do, copy Load's operand to a new buffer.
+
+    // First, create new blocks for the 2nd part of the check and the copy.
+    BasicBlock *Check0 = MatMul->getParent();
+    // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
+    // DT. Manually collect dominator tree updates, to avoid unnecessary work,
+    // as we adjust Check0 and Check1's branches.
+    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+    for (BasicBlock *Succ : successors(Check0))
+      DTUpdates.push_back({DT.Delete, Check0, Succ});
+
+    BasicBlock *Check1 = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
+                                    nullptr, "alias_cont");
+    BasicBlock *Copy =
+        SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI, nullptr, "copy");
+    BasicBlock *Fusion = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
+                                    nullptr, "no_alias");
+
+    // Check if the loaded memory location begins before the end of the store
+    // location. If the condition holds, they might overlap, otherwise they are
+    // guaranteed to not overlap.
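+    // Illustrative example (added for exposition, not in the original patch):
+    // for a load of <6 x double> at address L and a store of <9 x double> at
+    // address S, the accessed ranges are [L, L+48) and [S, S+72). They can
+    // only overlap if L < S+72 and S < L+48, which is exactly what the two
+    // emitted checks test before falling back to copying the load operand.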
+    IRBuilder<> Builder(MatMul);
+    Check0->getTerminator()->eraseFromParent();
+    Builder.SetInsertPoint(Check0);
+    Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
+    Value *StoreBegin = Builder.CreatePtrToInt(
+        const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
+    Value *StoreEnd = Builder.CreateAdd(
+        StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
+        "store.end", true, true);
+    Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
+                                              IntPtrTy, "load.begin");
+    Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
+                         Fusion);
+
+    // Check if the store begins before the end of the load location. If the
+    // condition holds, they alias, otherwise they are guaranteed to not
+    // overlap.
+    Check1->getTerminator()->eraseFromParent();
+    Builder.SetInsertPoint(Check1, Check1->begin());
+    Value *LoadEnd = Builder.CreateAdd(
+        LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
+        "load.end", true, true);
+    Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
+                         Fusion);
+
+    // Copy load operand to new alloca.
+    Builder.SetInsertPoint(Copy, Copy->begin());
+    AllocaInst *NewLd =
+        Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
+    Builder.CreateMemCpy(NewLd, MaybeAlign(NewLd->getAlignment()),
+                         Load->getPointerOperand(), Load->getAlign(),
+                         LoadLoc.Size.getValue());
+    Builder.SetInsertPoint(Fusion, Fusion->begin());
+    PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
+    PHI->addIncoming(Load->getPointerOperand(), Check0);
+    PHI->addIncoming(Load->getPointerOperand(), Check1);
+    PHI->addIncoming(NewLd, Copy);
+
+    // Adjust DT.
+    DTUpdates.push_back({DT.Insert, Check0, Check1});
+    DTUpdates.push_back({DT.Insert, Check0, Fusion});
+    DTUpdates.push_back({DT.Insert, Check1, Copy});
+    DTUpdates.push_back({DT.Insert, Check1, Fusion});
+    DTUpdates.push_back({DT.Insert, Copy, Fusion});
+    DT.applyUpdates(DTUpdates);
+    return PHI;
+  }
+
+  bool isFusionProfitable(CallInst *MatMul) {
+    if (ForceFusion)
+      return true;
+
+    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+    const unsigned R = LShape.NumRows;
+    const unsigned C = RShape.NumColumns;
+    const unsigned M = LShape.NumColumns;
+    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+    const unsigned VF =
+        std::max<unsigned>(TTI.getRegisterBitWidth(true) /
+                               EltType->getPrimitiveSizeInBits().getFixedSize(),
+                           1U);
+
+    // Cost model for tiling
+    //
+    // For tiling to be beneficial, we need reuse either along the R or
+    // the C axis. We vectorize along the R axis so that means at least
+    // 3 elements.
+    if (R <= VF && C == 1)
+      return false;
+    // Then we need enough elements to exceed the number of vector
+    // registers we have. Note that this is an oversimplification since
+    // fusing also takes some extra loads which may exceed the number of
+    // reloads necessary.
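+    // Worked example (added for exposition, not in the original patch): with
+    // double elements and 128-bit vector registers, VF = 2. An 8x8 * 8x8
+    // multiply then needs ceil(8/2) * 8 = 32 vector registers per operand,
+    // 64 in total, which exceeds e.g. the 32 vector registers available on
+    // AArch64, so tiling is considered profitable.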
+    unsigned Op0Regs = (R + VF - 1) / VF * M;
+    unsigned Op1Regs = (M + VF - 1) / VF * C;
+    return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
+  }
+
+  MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
+    MatrixTy Res;
+    Type *ColumnType = VectorType::get(EltType, R);
+    for (unsigned I = 0; I < C; ++I)
+      Res.addColumn(ConstantAggregateZero::get(ColumnType));
+    return Res;
+  }
+
+  void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
+                      StoreInst *Store,
+                      SmallPtrSetImpl<Instruction *> &FusedInsts) {
+    if (!isFusionProfitable(MatMul))
+      return;
+
+    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+    const unsigned R = LShape.NumRows;
+    const unsigned C = RShape.NumColumns;
+    const unsigned M = LShape.NumColumns;
+    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+    Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
+    Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
+    Value *CPtr = Store->getPointerOperand();
+
+    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+                                                  MatMul->hasAllowContract());
+    IRBuilder<> Builder(Store);
+    for (unsigned J = 0; J < C; J += TileSize)
+      for (unsigned I = 0; I < R; I += TileSize) {
+        const unsigned TileR = std::min(R - I, unsigned(TileSize));
+        const unsigned TileC = std::min(C - J, unsigned(TileSize));
+        MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
+
+        for (unsigned K = 0; K < M; K += TileSize) {
+          const unsigned TileM = std::min(M - K, unsigned(TileSize));
+          MatrixTy A =
+              loadMatrix(APtr, LShape, I, K, {TileR, TileM}, EltType, Builder);
+          MatrixTy B =
+              loadMatrix(BPtr, RShape, K, J, {TileM, TileC}, EltType, Builder);
+          emitChainedMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+        }
+        storeMatrix(Res, CPtr, {R, M}, I, J, EltType, Builder);
+      }
+
+    // Mark eliminated instructions as fused and remove them.
+    FusedInsts.insert(Store);
+    FusedInsts.insert(MatMul);
+    Store->eraseFromParent();
+    MatMul->eraseFromParent();
+    if (LoadOp0->hasNUses(0)) {
+      FusedInsts.insert(LoadOp0);
+      LoadOp0->eraseFromParent();
+    }
+    if (LoadOp1->hasNUses(0)) {
+      FusedInsts.insert(LoadOp1);
+      LoadOp1->eraseFromParent();
+    }
+  }
+
+  /// Try to lower matrix multiply chains by fusing operations.
+  ///
+  /// Currently we only lower {ld, ld} -> matmul -> st chains.
+  ///
+  /// No need to return a MatrixTy object for the result of the operation, since
+  /// the single store user will be lowered as part of this. Instructions that
+  /// are completely eliminated by fusion are added to \p FusedInsts.
+  void LowerMatrixMultiplyFused(CallInst *MatMul,
+                                SmallPtrSetImpl<Instruction *> &FusedInsts) {
+    if (!FuseMatrix || !MatMul->hasOneUse())
+      return;
+
+    auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
+    auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
+    auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
+    if (LoadOp0 && LoadOp1 && Store) {
+      // The store address must dominate the MatMul instruction, otherwise
+      // we create invalid IR.
+      // FIXME: See if we can hoist the store address computation.
+      auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
+      if (AddrI && (!OI.dominates(AddrI, MatMul)))
+        return;
+
+      emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
+      return;
+    }
+  }
+
   /// Lowers llvm.matrix.multiply.
void LowerMultiply(CallInst *MatMul) { IRBuilder<> Builder(MatMul); @@ -1481,7 +1729,11 @@ FunctionAnalysisManager &AM) { auto &TTI = AM.getResult(F); auto &ORE = AM.getResult(F); - LowerMatrixIntrinsics LMT(F, TTI, ORE); + auto &AA = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &LI = AM.getResult(F); + + LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE); if (LMT.Visit()) { PreservedAnalyses PA; PA.preserveSet(); @@ -1504,7 +1756,10 @@ bool runOnFunction(Function &F) override { auto &TTI = getAnalysis().getTTI(F); auto &ORE = getAnalysis().getORE(); - LowerMatrixIntrinsics LMT(F, TTI, ORE); + auto &AA = getAnalysis().getAAResults(); + auto &DT = getAnalysis().getDomTree(); + auto &LI = getAnalysis().getLoopInfo(); + LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE); bool C = LMT.Visit(); return C; } @@ -1512,7 +1767,11 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } }; } // namespace @@ -1522,6 +1781,9 @@ INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, false, false) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, false, false) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -instcombine -verify-dom-info %s -S | FileCheck %s + +; REQUIRES: aarch64-registered-target + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "aarch64-apple-ios" + +define void @test(<6 x double> * %A, <6 x double> * %B, <9 x double>* %C, i1 %cond) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COL_CAST196:%.*]] = bitcast <6 x double>* [[A:%.*]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD197:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST196]], align 8 +; CHECK-NEXT: [[COL_GEP198:%.*]] = getelementptr <6 x double>, <6 x double>* [[A]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST199:%.*]] = bitcast double* [[COL_GEP198]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD200:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST199]], align 8 +; CHECK-NEXT: [[COL_CAST201:%.*]] = bitcast <6 x double>* [[B:%.*]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD202:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST201]], align 8 +; CHECK-NEXT: [[COL_GEP203:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST204:%.*]] = bitcast double* [[COL_GEP203]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD205:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST204]], align 8 +; CHECK-NEXT: [[COL_GEP206:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST207:%.*]] = bitcast double* [[COL_GEP206]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD208:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST207]], align 8 +; CHECK-NEXT: [[ST_B:%.*]] = ptrtoint <9 
x double>* [[C:%.*]] to i64 +; CHECK-NEXT: [[ST_E:%.*]] = add nuw nsw i64 [[ST_B]], 72 +; CHECK-NEXT: [[LD_B:%.*]] = ptrtoint <6 x double>* [[A]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[ST_E]], [[LD_B]] +; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]] +; CHECK: alias_cont: +; CHECK-NEXT: [[LD_E:%.*]] = add nuw nsw i64 [[LD_B]], 48 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LD_E]], [[ST_B]] +; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]] +; CHECK: copy: +; CHECK-NEXT: [[TMP2:%.*]] = alloca <6 x double>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <6 x double>* [[TMP2]] to i8* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <6 x double>* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP3]], i8* nonnull align 16 dereferenceable(48) [[TMP4]], i64 48, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS]] +; CHECK: no_alias: +; CHECK-NEXT: [[TMP5:%.*]] = phi <6 x double>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ] +; CHECK-NEXT: [[ST_B1:%.*]] = ptrtoint <9 x double>* [[C]] to i64 +; CHECK-NEXT: [[ST_E2:%.*]] = add nuw nsw i64 [[ST_B1]], 72 +; CHECK-NEXT: [[LD_B6:%.*]] = ptrtoint <6 x double>* [[B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[ST_E2]], [[LD_B6]] +; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT3:%.*]], label [[NO_ALIAS5:%.*]] +; CHECK: alias_cont3: +; CHECK-NEXT: [[LD_E7:%.*]] = add nuw nsw i64 [[LD_B6]], 48 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LD_E7]], [[ST_B1]] +; CHECK-NEXT: br i1 [[TMP7]], label [[COPY4:%.*]], label [[NO_ALIAS5]] +; CHECK: copy4: +; CHECK-NEXT: [[TMP8:%.*]] = alloca <6 x double>, align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <6 x double>* [[TMP8]] to i8* +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <6 x double>* [[B]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP9]], i8* nonnull align 16 dereferenceable(48) [[TMP10]], i64 48, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS5]] +; CHECK: no_alias5: +; CHECK-NEXT: [[TMP11:%.*]] = phi <6 x double>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT3]] ], [ [[TMP8]], [[COPY4]] ] +; CHECK-NEXT: [[COL_CAST8:%.*]] = bitcast <6 x double>* [[TMP5]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST8]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST9]], align 8 +; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast <6 x double>* [[TMP11]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST12]], align 8 +; CHECK-NEXT: [[COL_GEP14:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST15:%.*]] = bitcast double* [[COL_GEP14]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST15]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT19]], <2 x 
double> [[TMP12]]) +; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT22]] +; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT25]], <2 x double> [[TMP14]]) +; CHECK-NEXT: [[COL_CAST27:%.*]] = bitcast <9 x double>* [[C]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[COL_CAST27]], align 8 +; CHECK-NEXT: [[COL_GEP28:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST29:%.*]] = bitcast double* [[COL_GEP28]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP15]], <2 x double>* [[COL_CAST29]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST31:%.*]] = bitcast double* [[TMP16]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD32:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST31]], align 8 +; CHECK-NEXT: [[COL_GEP33:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 5 +; CHECK-NEXT: [[COL_CAST34:%.*]] = bitcast double* [[COL_GEP33]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST34]], align 8 +; CHECK-NEXT: [[COL_CAST37:%.*]] = bitcast <6 x double>* [[TMP11]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD38:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST37]], align 8 +; CHECK-NEXT: [[COL_GEP39:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST40:%.*]] = bitcast double* [[COL_GEP39]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD41:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST40]], align 8 +; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x double> [[COL_LOAD32]], [[SPLAT_SPLATINSERT43]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> [[TMP17]]) +; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = shufflevector <2 x double> [[COL_LOAD41]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[COL_LOAD32]], [[SPLAT_SPLATINSERT49]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = shufflevector <2 x double> [[COL_LOAD41]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST55:%.*]] = bitcast double* [[TMP21]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP18]], <1 x double>* [[COL_CAST55]], align 8 +; CHECK-NEXT: [[COL_GEP56:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 5 +; CHECK-NEXT: [[COL_CAST57:%.*]] = bitcast double* [[COL_GEP56]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[COL_CAST57]], align 8 +; CHECK-NEXT: [[COL_CAST59:%.*]] = bitcast <6 x double>* [[TMP5]] to <2 x double>* +; CHECK-NEXT: 
[[COL_LOAD60:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST59]], align 8 +; CHECK-NEXT: [[COL_GEP61:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST62:%.*]] = bitcast double* [[COL_GEP61]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST62]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST65:%.*]] = bitcast double* [[TMP22]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD66:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST65]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD66]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD60]], [[SPLAT_SPLAT69]] +; CHECK-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD66]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD63]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP23]]) +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 6 +; CHECK-NEXT: [[COL_CAST74:%.*]] = bitcast double* [[TMP25]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP24]], <2 x double>* [[COL_CAST74]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST76:%.*]] = bitcast double* [[TMP26]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD77:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST76]], align 8 +; CHECK-NEXT: [[COL_GEP78:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 5 +; CHECK-NEXT: [[COL_CAST79:%.*]] = bitcast double* [[COL_GEP78]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD80:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST79]], align 8 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST82:%.*]] = bitcast double* [[TMP27]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST82]], align 8 +; CHECK-NEXT: [[SPLAT_SPLATINSERT85:%.*]] = shufflevector <2 x double> [[COL_LOAD83]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = fmul <1 x double> [[COL_LOAD77]], [[SPLAT_SPLATINSERT85]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT88:%.*]] = shufflevector <2 x double> [[COL_LOAD83]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD80]], <1 x double> [[SPLAT_SPLATINSERT88]], <1 x double> [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST91:%.*]] = bitcast double* [[TMP30]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP29]], <1 x double>* [[COL_CAST91]], align 8 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: [[TMP31:%.*]] = fadd <3 x double> [[COL_LOAD197]], [[COL_LOAD197]] +; CHECK-NEXT: [[TMP32:%.*]] = fadd <3 x double> [[COL_LOAD200]], [[COL_LOAD200]] +; CHECK-NEXT: [[COL_CAST214:%.*]] = bitcast <6 x double>* [[A]] to <3 x double>* +; CHECK-NEXT: store <3 x double> [[TMP31]], <3 x double>* [[COL_CAST214]], align 8 +; CHECK-NEXT: [[COL_GEP215:%.*]] = getelementptr <6 x double>, <6 x double>* [[A]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST216:%.*]] = bitcast double* [[COL_GEP215]] to <3 x double>* +; CHECK-NEXT: store <3 x 
double> [[TMP32]], <3 x double>* [[COL_CAST216]], align 8 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: false: +; CHECK-NEXT: [[TMP33:%.*]] = fadd <2 x double> [[COL_LOAD202]], [[COL_LOAD202]] +; CHECK-NEXT: [[TMP34:%.*]] = fadd <2 x double> [[COL_LOAD205]], [[COL_LOAD205]] +; CHECK-NEXT: [[TMP35:%.*]] = fadd <2 x double> [[COL_LOAD208]], [[COL_LOAD208]] +; CHECK-NEXT: [[COL_CAST209:%.*]] = bitcast <6 x double>* [[B]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP33]], <2 x double>* [[COL_CAST209]], align 8 +; CHECK-NEXT: [[COL_GEP210:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST211:%.*]] = bitcast double* [[COL_GEP210]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP34]], <2 x double>* [[COL_CAST211]], align 8 +; CHECK-NEXT: [[COL_GEP212:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST213:%.*]] = bitcast double* [[COL_GEP212]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP35]], <2 x double>* [[COL_CAST213]], align 8 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[ST_B92:%.*]] = ptrtoint <9 x double>* [[C]] to i64 +; CHECK-NEXT: [[ST_E93:%.*]] = add nuw nsw i64 [[ST_B92]], 72 +; CHECK-NEXT: [[LD_B97:%.*]] = ptrtoint <6 x double>* [[A]] to i64 +; CHECK-NEXT: [[TMP36:%.*]] = icmp ugt i64 [[ST_E93]], [[LD_B97]] +; CHECK-NEXT: br i1 [[TMP36]], label [[ALIAS_CONT94:%.*]], label [[NO_ALIAS96:%.*]] +; CHECK: alias_cont94: +; CHECK-NEXT: [[LD_E98:%.*]] = add nuw nsw i64 [[LD_B97]], 48 +; CHECK-NEXT: [[TMP37:%.*]] = icmp ugt i64 [[LD_E98]], [[ST_B92]] +; CHECK-NEXT: br i1 [[TMP37]], label [[COPY95:%.*]], label [[NO_ALIAS96]] +; CHECK: copy95: +; CHECK-NEXT: [[TMP38:%.*]] = alloca <6 x double>, align 64 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <6 x double>* [[TMP38]] to i8* +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <6 x double>* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP39]], i8* nonnull align 16 dereferenceable(48) [[TMP40]], i64 48, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS96]] +; CHECK: no_alias96: +; CHECK-NEXT: [[TMP41:%.*]] = phi <6 x double>* [ [[A]], [[END]] ], [ [[A]], [[ALIAS_CONT94]] ], [ [[TMP38]], [[COPY95]] ] +; CHECK-NEXT: [[ST_B99:%.*]] = ptrtoint <9 x double>* [[C]] to i64 +; CHECK-NEXT: [[ST_E100:%.*]] = add nuw nsw i64 [[ST_B99]], 72 +; CHECK-NEXT: [[LD_B104:%.*]] = ptrtoint <6 x double>* [[B]] to i64 +; CHECK-NEXT: [[TMP42:%.*]] = icmp ugt i64 [[ST_E100]], [[LD_B104]] +; CHECK-NEXT: br i1 [[TMP42]], label [[ALIAS_CONT101:%.*]], label [[NO_ALIAS103:%.*]] +; CHECK: alias_cont101: +; CHECK-NEXT: [[LD_E105:%.*]] = add nuw nsw i64 [[LD_B104]], 48 +; CHECK-NEXT: [[TMP43:%.*]] = icmp ugt i64 [[LD_E105]], [[ST_B99]] +; CHECK-NEXT: br i1 [[TMP43]], label [[COPY102:%.*]], label [[NO_ALIAS103]] +; CHECK: copy102: +; CHECK-NEXT: [[TMP44:%.*]] = alloca <6 x double>, align 64 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast <6 x double>* [[TMP44]] to i8* +; CHECK-NEXT: [[TMP46:%.*]] = bitcast <6 x double>* [[B]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP45]], i8* nonnull align 16 dereferenceable(48) [[TMP46]], i64 48, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS103]] +; CHECK: no_alias103: +; CHECK-NEXT: [[TMP47:%.*]] = phi <6 x double>* [ [[B]], [[NO_ALIAS96]] ], [ [[B]], [[ALIAS_CONT101]] ], [ [[TMP44]], [[COPY102]] ] +; CHECK-NEXT: [[COL_CAST107:%.*]] = bitcast <6 x double>* [[TMP41]] to <2 x double>* +; CHECK-NEXT: 
[[COL_LOAD108:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST107]], align 8 +; CHECK-NEXT: [[COL_GEP109:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST110:%.*]] = bitcast double* [[COL_GEP109]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD111:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST110]], align 8 +; CHECK-NEXT: [[COL_CAST113:%.*]] = bitcast <6 x double>* [[TMP47]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD114:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST113]], align 8 +; CHECK-NEXT: [[COL_GEP115:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST116:%.*]] = bitcast double* [[COL_GEP115]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD117:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST116]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT120:%.*]] = shufflevector <2 x double> [[COL_LOAD114]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD108]], [[SPLAT_SPLAT120]] +; CHECK-NEXT: [[SPLAT_SPLAT123:%.*]] = shufflevector <2 x double> [[COL_LOAD114]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD111]], <2 x double> [[SPLAT_SPLAT123]], <2 x double> [[TMP48]]) +; CHECK-NEXT: [[SPLAT_SPLAT126:%.*]] = shufflevector <2 x double> [[COL_LOAD117]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD108]], [[SPLAT_SPLAT126]] +; CHECK-NEXT: [[SPLAT_SPLAT129:%.*]] = shufflevector <2 x double> [[COL_LOAD117]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD111]], <2 x double> [[SPLAT_SPLAT129]], <2 x double> [[TMP50]]) +; CHECK-NEXT: [[COL_CAST131:%.*]] = bitcast <9 x double>* [[C]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP49]], <2 x double>* [[COL_CAST131]], align 8 +; CHECK-NEXT: [[COL_GEP132:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST133:%.*]] = bitcast double* [[COL_GEP132]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP51]], <2 x double>* [[COL_CAST133]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST135:%.*]] = bitcast double* [[TMP52]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST135]], align 8 +; CHECK-NEXT: [[COL_GEP137:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 5 +; CHECK-NEXT: [[COL_CAST138:%.*]] = bitcast double* [[COL_GEP137]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD139:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST138]], align 8 +; CHECK-NEXT: [[COL_CAST141:%.*]] = bitcast <6 x double>* [[TMP47]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD142:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST141]], align 8 +; CHECK-NEXT: [[COL_GEP143:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST144:%.*]] = bitcast double* [[COL_GEP143]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD145:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST144]], align 8 +; CHECK-NEXT: [[SPLAT_SPLATINSERT147:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = fmul <1 x double> [[COL_LOAD136]], [[SPLAT_SPLATINSERT147]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT150:%.*]] = shufflevector <2 x double> 
[[COL_LOAD142]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD139]], <1 x double> [[SPLAT_SPLATINSERT150]], <1 x double> [[TMP53]]) +; CHECK-NEXT: [[SPLAT_SPLATINSERT153:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP55:%.*]] = fmul <1 x double> [[COL_LOAD136]], [[SPLAT_SPLATINSERT153]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT156:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD139]], <1 x double> [[SPLAT_SPLATINSERT156]], <1 x double> [[TMP55]]) +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST159:%.*]] = bitcast double* [[TMP57]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP54]], <1 x double>* [[COL_CAST159]], align 8 +; CHECK-NEXT: [[COL_GEP160:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 5 +; CHECK-NEXT: [[COL_CAST161:%.*]] = bitcast double* [[COL_GEP160]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP56]], <1 x double>* [[COL_CAST161]], align 8 +; CHECK-NEXT: [[COL_CAST163:%.*]] = bitcast <6 x double>* [[TMP41]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD164:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST163]], align 8 +; CHECK-NEXT: [[COL_GEP165:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 3 +; CHECK-NEXT: [[COL_CAST166:%.*]] = bitcast double* [[COL_GEP165]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD167:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST166]], align 8 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST169:%.*]] = bitcast double* [[TMP58]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD170:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST169]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT173:%.*]] = shufflevector <2 x double> [[COL_LOAD170]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = fmul <2 x double> [[COL_LOAD164]], [[SPLAT_SPLAT173]] +; CHECK-NEXT: [[SPLAT_SPLAT176:%.*]] = shufflevector <2 x double> [[COL_LOAD170]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD167]], <2 x double> [[SPLAT_SPLAT176]], <2 x double> [[TMP59]]) +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 6 +; CHECK-NEXT: [[COL_CAST178:%.*]] = bitcast double* [[TMP61]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP60]], <2 x double>* [[COL_CAST178]], align 8 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST180:%.*]] = bitcast double* [[TMP62]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD181:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST180]], align 8 +; CHECK-NEXT: [[COL_GEP182:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 5 +; CHECK-NEXT: [[COL_CAST183:%.*]] = bitcast double* [[COL_GEP182]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD184:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST183]], align 8 +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST186:%.*]] = bitcast double* [[TMP63]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD187:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST186]], align 8 +; 
CHECK-NEXT: [[SPLAT_SPLATINSERT189:%.*]] = shufflevector <2 x double> [[COL_LOAD187]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[COL_LOAD181]], [[SPLAT_SPLATINSERT189]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT192:%.*]] = shufflevector <2 x double> [[COL_LOAD187]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD184]], <1 x double> [[SPLAT_SPLATINSERT192]], <1 x double> [[TMP64]]) +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST195:%.*]] = bitcast double* [[TMP66]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP65]], <1 x double>* [[COL_CAST195]], align 8 +; CHECK-NEXT: ret void +; +entry: + %a = load <6 x double>, <6 x double>* %A, align 16 + %b = load <6 x double>, <6 x double>* %B, align 16 + %c = call <9 x double> @llvm.matrix.multiply(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3) + store <9 x double> %c, <9 x double>* %C, align 16 + + br i1 %cond, label %true, label %false + +true: + %a.add = fadd <6 x double> %a, %a + store <6 x double> %a.add, <6 x double>* %A + br label %end + +false: + %b.add = fadd <6 x double> %b, %b + store <6 x double> %b.add, <6 x double>* %B + br label %end + +end: + %a.2 = load <6 x double>, <6 x double>* %A, align 16 + %b.2 = load <6 x double>, <6 x double>* %B, align 16 + %c.2 = call <9 x double> @llvm.matrix.multiply(<6 x double> %a.2, <6 x double> %b.2, i32 3, i32 2, i32 3) + store <9 x double> %c.2, <9 x double>* %C, align 16 + ret void +} + +declare <9 x double> @llvm.matrix.multiply(<6 x double>, <6 x double>, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll @@ -0,0 +1,273 @@ +; RUN: opt -lower-matrix-intrinsics -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -instcombine %s -S | FileCheck %s + +; REQUIRES: aarch64-registered-target + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "aarch64-apple-ios" + +define void @multiply(<16 x double> * %A, <16 x double> * %B, <16 x double>* %C) { +; CHECK-LABEL: @multiply( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ST_B:%.*]] = ptrtoint <16 x double>* [[C:%.*]] to i64 +; CHECK-NEXT: [[ST_E:%.*]] = add nuw nsw i64 [[ST_B]], 128 +; CHECK-NEXT: [[LD_B:%.*]] = ptrtoint <16 x double>* [[A:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[ST_E]], [[LD_B]] +; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]] +; CHECK: alias_cont: +; CHECK-NEXT: [[LD_E:%.*]] = add nuw nsw i64 [[LD_B]], 128 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LD_E]], [[ST_B]] +; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]] +; CHECK: copy: +; CHECK-NEXT: [[TMP2:%.*]] = alloca <16 x double>, align 128 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x double>* [[TMP2]] to i8* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x double>* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP3]], i8* nonnull align 16 dereferenceable(128) [[TMP4]], i64 128, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS]] +; CHECK: no_alias: +; CHECK-NEXT: [[TMP5:%.*]] = phi <16 x double>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ] +; CHECK-NEXT: [[ST_B1:%.*]] = ptrtoint <16 x double>* 
[[C]] to i64 +; CHECK-NEXT: [[ST_E2:%.*]] = add nuw nsw i64 [[ST_B1]], 128 +; CHECK-NEXT: [[LD_B6:%.*]] = ptrtoint <16 x double>* [[B:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[ST_E2]], [[LD_B6]] +; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT3:%.*]], label [[NO_ALIAS5:%.*]] +; CHECK: alias_cont3: +; CHECK-NEXT: [[LD_E7:%.*]] = add nuw nsw i64 [[LD_B6]], 128 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LD_E7]], [[ST_B1]] +; CHECK-NEXT: br i1 [[TMP7]], label [[COPY4:%.*]], label [[NO_ALIAS5]] +; CHECK: copy4: +; CHECK-NEXT: [[TMP8:%.*]] = alloca <16 x double>, align 128 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x double>* [[TMP8]] to i8* +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x double>* [[B]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP9]], i8* nonnull align 16 dereferenceable(128) [[TMP10]], i64 128, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS5]] + +; CHECK: no_alias5: +; CHECK-NEXT: [[TMP11:%.*]] = phi <16 x double>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT3]] ], [ [[TMP8]], [[COPY4]] ] + +;; np.dot(a[0:2, 0:2], b[0:2, 0:2]) + +; CHECK-NEXT: [[COL_CAST8:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST8]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST9]], align 8 +; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST12]], align 8 +; CHECK-NEXT: [[COL_GEP14:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST15:%.*]] = bitcast double* [[COL_GEP14]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST15]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP12]]) +; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT22]] +; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT25]], <2 x double> [[TMP14]]) + +;; + np.dot(a[0:2, 2:4], b[2:4, 0:2]) + +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST27:%.*]] = bitcast double* [[TMP16]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD28:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST27]], align 8 +; CHECK-NEXT: [[COL_GEP29:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST30:%.*]] = bitcast double* [[COL_GEP29]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD31:%.*]] = load <2 x double>, <2 x double>* 
[[COL_CAST30]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST33:%.*]] = bitcast double* [[TMP17]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD34:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST33]], align 8 +; CHECK-NEXT: [[COL_GEP35:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 6 +; CHECK-NEXT: [[COL_CAST36:%.*]] = bitcast double* [[COL_GEP35]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST36]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD28]], <2 x double> [[SPLAT_SPLAT41]], <2 x double> [[TMP13]]) +; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD31]], <2 x double> [[SPLAT_SPLAT44]], <2 x double> [[TMP18]]) +; CHECK-NEXT: [[SPLAT_SPLAT48:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD28]], <2 x double> [[SPLAT_SPLAT48]], <2 x double> [[TMP15]]) +; CHECK-NEXT: [[SPLAT_SPLAT51:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD31]], <2 x double> [[SPLAT_SPLAT51]], <2 x double> [[TMP20]]) + +;; -> c[0:2, 0:2] + +; CHECK-NEXT: [[COL_CAST53:%.*]] = bitcast <16 x double>* [[C]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[COL_CAST53]], align 8 +; CHECK-NEXT: [[COL_GEP54:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST55:%.*]] = bitcast double* [[COL_GEP54]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[COL_CAST55]], align 8 + +;; np.dot(a[2:4, 0:2], b[0:2, 0:2]) + +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST57:%.*]] = bitcast double* [[TMP22]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD58:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST57]], align 8 +; CHECK-NEXT: [[COL_GEP59:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 6 +; CHECK-NEXT: [[COL_CAST60:%.*]] = bitcast double* [[COL_GEP59]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD61:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST60]], align 8 +; CHECK-NEXT: [[COL_CAST63:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD64:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST63]], align 8 +; CHECK-NEXT: [[COL_GEP65:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST66:%.*]] = bitcast double* [[COL_GEP65]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD67:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST66]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT70:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT70]] +; CHECK-NEXT: [[SPLAT_SPLAT73:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = call <2 x double> 
@llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD61]], <2 x double> [[SPLAT_SPLAT73]], <2 x double> [[TMP23]]) +; CHECK-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT76]] +; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD61]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP25]]) + +;; + np.dot(a[2:4, 2:4], b[2:4, 0:2]) + +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST81:%.*]] = bitcast double* [[TMP27]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD82:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST81]], align 8 +; CHECK-NEXT: [[COL_GEP83:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 14 +; CHECK-NEXT: [[COL_CAST84:%.*]] = bitcast double* [[COL_GEP83]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD85:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST84]], align 8 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST87:%.*]] = bitcast double* [[TMP28]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD88:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST87]], align 8 +; CHECK-NEXT: [[COL_GEP89:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 6 +; CHECK-NEXT: [[COL_CAST90:%.*]] = bitcast double* [[COL_GEP89]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD91:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST90]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD82]], <2 x double> [[SPLAT_SPLAT95]], <2 x double> [[TMP24]]) +; CHECK-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD85]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP29]]) +; CHECK-NEXT: [[SPLAT_SPLAT102:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD82]], <2 x double> [[SPLAT_SPLAT102]], <2 x double> [[TMP26]]) +; CHECK-NEXT: [[SPLAT_SPLAT105:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD85]], <2 x double> [[SPLAT_SPLAT105]], <2 x double> [[TMP31]]) + +;; -> c[2:4, 0:2] + +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST107:%.*]] = bitcast double* [[TMP33]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP30]], <2 x double>* [[COL_CAST107]], align 8 +; CHECK-NEXT: [[COL_GEP108:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 6 +; CHECK-NEXT: [[COL_CAST109:%.*]] = bitcast double* [[COL_GEP108]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP32]], <2 x double>* [[COL_CAST109]], align 8 + +;; np.dot(a[0:2, 0:2], b[0:2, 2:4]) + +; CHECK-NEXT: [[COL_CAST111:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD112:%.*]] = load <2 x 
double>, <2 x double>* [[COL_CAST111]], align 8 +; CHECK-NEXT: [[COL_GEP113:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST114:%.*]] = bitcast double* [[COL_GEP113]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST114]], align 8 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST117:%.*]] = bitcast double* [[TMP34]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD118:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST117]], align 8 +; CHECK-NEXT: [[COL_GEP119:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST120:%.*]] = bitcast double* [[COL_GEP119]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD121:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST120]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT124:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT124]] +; CHECK-NEXT: [[SPLAT_SPLAT127:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT127]], <2 x double> [[TMP35]]) +; CHECK-NEXT: [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT130]] +; CHECK-NEXT: [[SPLAT_SPLAT133:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT133]], <2 x double> [[TMP37]]) + +;; + np.dot(a[0:2, 2:4], b[2:4, 2:4]) + +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST135:%.*]] = bitcast double* [[TMP39]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST135]], align 8 +; CHECK-NEXT: [[COL_GEP137:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST138:%.*]] = bitcast double* [[COL_GEP137]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD139:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST138]], align 8 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST141:%.*]] = bitcast double* [[TMP40]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD142:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST141]], align 8 +; CHECK-NEXT: [[COL_GEP143:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 14 +; CHECK-NEXT: [[COL_CAST144:%.*]] = bitcast double* [[COL_GEP143]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD145:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST144]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT149:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD136]], <2 x double> [[SPLAT_SPLAT149]], <2 x double> [[TMP36]]) +; CHECK-NEXT: [[SPLAT_SPLAT152:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD139]], <2 x double> [[SPLAT_SPLAT152]], <2 
x double> [[TMP41]]) +; CHECK-NEXT: [[SPLAT_SPLAT156:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD136]], <2 x double> [[SPLAT_SPLAT156]], <2 x double> [[TMP38]]) +; CHECK-NEXT: [[SPLAT_SPLAT159:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD139]], <2 x double> [[SPLAT_SPLAT159]], <2 x double> [[TMP43]]) + +;; -> c[0:2, 2:4] + +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST161:%.*]] = bitcast double* [[TMP45]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP42]], <2 x double>* [[COL_CAST161]], align 8 +; CHECK-NEXT: [[COL_GEP162:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST163:%.*]] = bitcast double* [[COL_GEP162]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP44]], <2 x double>* [[COL_CAST163]], align 8 + +;; np.dot(a[2:4, 0:2], b[2:4, 0:2]) + +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST165:%.*]] = bitcast double* [[TMP46]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD166:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST165]], align 8 +; CHECK-NEXT: [[COL_GEP167:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 6 +; CHECK-NEXT: [[COL_CAST168:%.*]] = bitcast double* [[COL_GEP167]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD169:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST168]], align 8 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST171:%.*]] = bitcast double* [[TMP47]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD172:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST171]], align 8 +; CHECK-NEXT: [[COL_GEP173:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST174:%.*]] = bitcast double* [[COL_GEP173]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD175:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST174]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT178:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT178]] +; CHECK-NEXT: [[SPLAT_SPLAT181:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD169]], <2 x double> [[SPLAT_SPLAT181]], <2 x double> [[TMP48]]) +; CHECK-NEXT: [[SPLAT_SPLAT184:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT184]] +; CHECK-NEXT: [[SPLAT_SPLAT187:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD169]], <2 x double> [[SPLAT_SPLAT187]], <2 x double> [[TMP50]]) + +;; + np.dot(a[2:4, 2:4], b[2:4, 2:4]) + +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST189:%.*]] = bitcast double* [[TMP52]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD190:%.*]] = load <2 x double>, <2 x double>* 
[[COL_CAST189]], align 8 +; CHECK-NEXT: [[COL_GEP191:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 14 +; CHECK-NEXT: [[COL_CAST192:%.*]] = bitcast double* [[COL_GEP191]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD193:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST192]], align 8 +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST195:%.*]] = bitcast double* [[TMP53]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD196:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST195]], align 8 +; CHECK-NEXT: [[COL_GEP197:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 14 +; CHECK-NEXT: [[COL_CAST198:%.*]] = bitcast double* [[COL_GEP197]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD199:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST198]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT203:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP54:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD190]], <2 x double> [[SPLAT_SPLAT203]], <2 x double> [[TMP49]]) +; CHECK-NEXT: [[SPLAT_SPLAT206:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD193]], <2 x double> [[SPLAT_SPLAT206]], <2 x double> [[TMP54]]) +; CHECK-NEXT: [[SPLAT_SPLAT210:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP56:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD190]], <2 x double> [[SPLAT_SPLAT210]], <2 x double> [[TMP51]]) +; CHECK-NEXT: [[SPLAT_SPLAT213:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD193]], <2 x double> [[SPLAT_SPLAT213]], <2 x double> [[TMP56]]) + +;; -> c[2:4, 2:4] + +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST215:%.*]] = bitcast double* [[TMP58]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP55]], <2 x double>* [[COL_CAST215]], align 8 +; CHECK-NEXT: [[COL_GEP216:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 14 +; CHECK-NEXT: [[COL_CAST217:%.*]] = bitcast double* [[COL_GEP216]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP57]], <2 x double>* [[COL_CAST217]], align 8 +; CHECK-NEXT: ret void +; +entry: + %a = load <16 x double>, <16 x double>* %A, align 16 + %b = load <16 x double>, <16 x double>* %B, align 16 + + %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4) + + store <16 x double> %c, <16 x double>* %C, align 16 + ret void +} + +declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)