diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -19,7 +19,10 @@
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/OrderedInstructions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -34,6 +37,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -45,6 +49,9 @@
     cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
              "instructions."));
 
+static cl::opt<bool> FuseMatrix("fuse-matrix", cl::init(true));
+static cl::opt<unsigned> TileSize("fuse-matrix-tile-size", cl::init(4));
+static cl::opt<bool> ForceFusion("force-fuse-matrix", cl::init(false));
 static cl::opt<bool> AllowContractEnabled(
     "matrix-allow-contract", cl::init(false), cl::Hidden,
     cl::desc("Allow the use of FMAs if available and profitable. This may "
@@ -139,6 +146,10 @@
   Function &Func;
   const DataLayout &DL;
   const TargetTransformInfo &TTI;
+  AliasAnalysis &AA;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  OrderedInstructions OI;
   OptimizationRemarkEmitter &ORE;
 
   /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
@@ -271,8 +282,10 @@
 
 public:
   LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
+                        AliasAnalysis &AA, DominatorTree &DT, LoopInfo &LI,
                         OptimizationRemarkEmitter &ORE)
-      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), ORE(ORE) {}
+      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
+        LI(LI), OI(&DT), ORE(ORE) {}
 
   unsigned getNumOps(Type *VT) {
     assert(isa<VectorType>(VT) && "Expected vector type");
@@ -528,6 +541,35 @@
     return NewWorkList;
   }
 
+  /// Visit \p BB and try to fuse matrix instructions.
+  bool visitBBFusion(BasicBlock *BB,
+                     SmallVectorImpl<Instruction *> &MatrixInsts) {
+    bool Changed = false;
+    for (auto I = BB->begin(); I != BB->end(); ++I) {
+      Instruction &Inst = *I;
+      bool Touched = false;
+      if (IntrinsicInst *IInst = dyn_cast<IntrinsicInst>(&Inst)) {
+        if (IInst->getIntrinsicID() == Intrinsic::matrix_multiply) {
+
+          if (BasicBlock *NextBB = LowerMatrixMultiplyFused(IInst)) {
+            Touched = true;
+            // We create new basic blocks when fusing multiplies. Those will not
+            // be part of the RPO, so we visit the BB containing the remainder
+            // of the original instructions.
+            visitBBFusion(NextBB, MatrixInsts);
+            return true;
+          }
+        }
+      }
+      // Collect instructions producing matrix values, stores or bitcasts.
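+      // Everything that has a shape assigned in ShapeMap is queued here and
+      // lowered one operation at a time in Visit().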
+      if (!Touched && ShapeMap.find(&Inst) != ShapeMap.end())
+        MatrixInsts.push_back(&Inst);
+
+      Changed |= Touched;
+    }
+    return Changed;
+  }
+
   bool Visit() {
     if (EnableShapePropagation) {
       SmallVector<Instruction *, 8> WorkList;
@@ -557,25 +599,27 @@
         WorkList = propagateShapeBackward(WorkList);
       }
     }
 
+    bool Changed = false;
+    SmallVector<Instruction *, 16> MatrixInsts;
     ReversePostOrderTraversal<Function *> RPOT(&Func);
-    bool Changed = false;
-    for (auto *BB : RPOT) {
-      for (Instruction &Inst : make_early_inc_range(*BB)) {
-        IRBuilder<> Builder(&Inst);
-
-        if (CallInst *CInst = dyn_cast<CallInst>(&Inst))
-          Changed |= VisitCallInst(CInst);
-
-        Value *Op1;
-        Value *Op2;
-        if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst))
-          Changed |= VisitBinaryOperator(BinOp);
-        if (match(&Inst, m_Load(m_Value(Op1))))
-          Changed |= VisitLoad(&Inst, Op1, Builder);
-        else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2))))
-          Changed |= VisitStore(&Inst, Op1, Op2, Builder);
-      }
+    for (auto *BB : RPOT)
+      Changed |= visitBBFusion(BB, MatrixInsts);
+
+    for (Instruction *Inst : MatrixInsts) {
+      IRBuilder<> Builder(Inst);
+
+      if (CallInst *CInst = dyn_cast<CallInst>(Inst))
+        Changed |= VisitCallInst(CInst);
+
+      Value *Op1;
+      Value *Op2;
+      if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
+        Changed |= VisitBinaryOperator(BinOp);
+      if (match(Inst, m_Load(m_Value(Op1))))
+        Changed |= VisitLoad(Inst, Op1, Builder);
+      else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
+        Changed |= VisitStore(Inst, Op1, Op2, Builder);
     }
 
     RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
@@ -888,6 +932,193 @@
     }
   }
 
+  /// Ensure that the memory in \p Load does not alias \p Store by potentially
+  /// copying it to a new location. Returns the new location if a copy was
+  /// made, or the original location otherwise.
+  Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
+                               CallInst *MatMul) {
+    MemoryLocation St = MemoryLocation::get(Store);
+    MemoryLocation Ld = MemoryLocation::get(Load);
+
+    AliasResult LdAliased = AA.alias(Ld, St);
+
+    // If we can statically determine noalias we're good.
+    if (!LdAliased)
+      return Load->getPointerOperand();
+
+    IRBuilder<> Builder(MatMul);
+    Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
+
+    Value *St_b =
+        Builder.CreatePtrToInt(const_cast<Value *>(St.Ptr), IntPtrTy, "st_b");
+    Value *St_e =
+        Builder.CreateAdd(St_b, ConstantInt::get(IntPtrTy, St.Size.getValue()),
+                          "st_e", true, true);
+
+    BasicBlock *Check0 = MatMul->getParent();
+
+    // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
+    // DT. Manually collect dominator tree updates, to avoid unnecessary work,
+    // as we adjust Check0 and Check1's branches.
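+
+    // The runtime check built below has the following shape, with the block
+    // names used in the test:
+    //
+    //   entry (Check0):  br (ld_b < st_e), alias_cont, no_alias
+    //   alias_cont:      br (st_b < ld_e), copy, no_alias
+    //   copy:            memcpy the loaded operand into a fresh alloca
+    //   no_alias:        phi of the original pointer or the copy; MatMul and
+    //                    the tiled code end up in this block.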
+    SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
+    for (BasicBlock *Succ : successors(Check0))
+      DTUpdates.push_back({DT.Delete, Check0, Succ});
+
+    BasicBlock *Check1 = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
+                                    nullptr, "alias_cont");
+    BasicBlock *Copy =
+        SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI, nullptr, "copy");
+    BasicBlock *Fusion = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
+                                    nullptr, "no_alias");
+    DTUpdates.push_back({DT.Insert, Fusion, Copy});
+    DTUpdates.push_back({DT.Insert, Copy, Check1});
+
+    Check0->getTerminator()->eraseFromParent();
+    Builder.SetInsertPoint(Check0);
+    Value *Ld_b =
+        Builder.CreatePtrToInt(const_cast<Value *>(Ld.Ptr), IntPtrTy, "ld_b");
+    Builder.CreateCondBr(Builder.CreateICmpULT(Ld_b, St_e), Check1, Fusion);
+    DTUpdates.push_back({DT.Insert, Check0, Check1});
+    DTUpdates.push_back({DT.Insert, Check0, Fusion});
+
+    Check1->getTerminator()->eraseFromParent();
+    Builder.SetInsertPoint(Check1, Check1->begin());
+    Value *Ld_e =
+        Builder.CreateAdd(Ld_b, ConstantInt::get(IntPtrTy, Ld.Size.getValue()),
+                          "ld_e", true, true);
+    Builder.CreateCondBr(Builder.CreateICmpULT(St_b, Ld_e), Copy, Fusion);
+    DTUpdates.push_back({DT.Insert, Check1, Copy});
+    DTUpdates.push_back({DT.Insert, Check1, Fusion});
+    DT.applyUpdates(DTUpdates);
+
+    Builder.SetInsertPoint(Copy, Copy->begin());
+    AllocaInst *NewLd =
+        Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
+    Builder.CreateMemCpy(NewLd, MaybeAlign(NewLd->getAlignment()),
+                         Load->getPointerOperand(), Load->getAlign(),
+                         Ld.Size.getValue());
+
+    Builder.SetInsertPoint(Fusion, Fusion->begin());
+    PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
+    PHI->addIncoming(Load->getPointerOperand(), Check0);
+    PHI->addIncoming(Load->getPointerOperand(), Check1);
+    PHI->addIncoming(NewLd, Copy);
+
+    return PHI;
+  }
+
+  bool isFusionProfitable(CallInst *MatMul) {
+    if (ForceFusion)
+      return true;
+
+    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+    const unsigned R = LShape.NumRows;
+    const unsigned C = RShape.NumColumns;
+    const unsigned M = LShape.NumColumns;
+    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+    const unsigned VF =
+        std::max<unsigned>(TTI.getRegisterBitWidth(true) /
+                               EltType->getPrimitiveSizeInBits().getFixedSize(),
+                           1U);
+
+    // Cost model for tiling
+    //
+    // For tiling to be beneficial, we need reuse either along the R or
+    // the C axis. We vectorize along the R axis so that means at least
+    // 3 elements.
+    if (R <= VF && C == 1)
+      return false;
+    // Then we need enough elements to exceed the number of vector
+    // registers we have. Note that this is an oversimplification since
+    // fusing also takes some extra loads which may exceed the number of
+    // reloads necessary.
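+    // Each column of Op0 needs ceil(R / VF) vector registers and there are M
+    // columns, i.e. ceil(R / VF) * M registers; analogously ceil(M / VF) * C
+    // for Op1. E.g. a 4x4 * 4x4 double multiply with 128-bit vectors (VF = 2)
+    // is estimated at 2 * 4 + 2 * 4 = 16 registers.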
+    unsigned Op0Regs = (R + VF - 1) / VF * M;
+    unsigned Op1Regs = (M + VF - 1) / VF * C;
+    return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
+  }
+
+  ColumnMatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
+    ColumnMatrixTy Res;
+    Type *ColumType = VectorType::get(EltType, R);
+    for (unsigned I = 0; I < C; ++I)
+      Res.addColumn(ConstantAggregateZero::get(ColumType));
+    return Res;
+  }
+
+  BasicBlock *emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0,
+                             LoadInst *LoadOp1, StoreInst *Store) {
+    if (!isFusionProfitable(MatMul))
+      return nullptr;
+
+    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+    const unsigned R = LShape.NumRows;
+    const unsigned C = RShape.NumColumns;
+    const unsigned M = LShape.NumColumns;
+    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+    Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
+    Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
+    Value *CPtr = Store->getPointerOperand();
+
+    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+                                                  MatMul->hasAllowContract());
+
+    IRBuilder<> Builder(Store);
+    for (unsigned J = 0; J < C; J += TileSize)
+      for (unsigned I = 0; I < R; I += TileSize) {
+        const unsigned TileR = std::min(R - I, unsigned(TileSize));
+        const unsigned TileC = std::min(C - J, unsigned(TileSize));
+        ColumnMatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
+
+        for (unsigned K = 0; K < M; K += TileSize) {
+          const unsigned TileM = std::min(M - K, unsigned(TileSize));
+          ColumnMatrixTy A =
+              loadMatrix(APtr, LShape, I, K, {TileR, TileM}, EltType, Builder);
+          ColumnMatrixTy B =
+              loadMatrix(BPtr, RShape, K, J, {TileM, TileC}, EltType, Builder);
+          emitChainedMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+        }
+        storeMatrix(Res, CPtr, {R, M}, I, J, EltType, Builder);
+      }
+
+    Store->eraseFromParent();
+    BasicBlock *Cont = MatMul->getParent();
+    MatMul->eraseFromParent();
+    return Cont;
+  }
+
+  /// Try to lower matrix multiply chains by fusing operations.
+  ///
+  /// Currently we only lower {ld, ld} -> matmul -> st chains.
+  ///
+  /// No need to return LoweredMatrix since the single store user will be
+  /// lowered as part of this.
+  BasicBlock *LowerMatrixMultiplyFused(CallInst *MatMul) {
+    if (!FuseMatrix)
+      return nullptr;
+
+    if (auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0)))
+      if (auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1)))
+        if (MatMul->hasOneUse())
+          if (auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin())) {
+            // The store address must dominate the MatMul instruction, otherwise
+            // we create invalid IR.
+            // FIXME: See if we can hoist the store address computation.
+            auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
+            if (AddrI && (!OI.dominates(AddrI, MatMul)))
+              return nullptr;
+
+            return emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store);
+          }
+
+    return nullptr;
+  }
+
   /// Lowers llvm.matrix.multiply.
void LowerMultiply(CallInst *MatMul) { IRBuilder<> Builder(MatMul); @@ -1411,7 +1642,11 @@ FunctionAnalysisManager &AM) { auto &TTI = AM.getResult(F); auto &ORE = AM.getResult(F); - LowerMatrixIntrinsics LMT(F, TTI, ORE); + auto &AA = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &LI = AM.getResult(F); + + LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE); if (LMT.Visit()) { PreservedAnalyses PA; PA.preserveSet(); @@ -1434,7 +1669,10 @@ bool runOnFunction(Function &F) override { auto &TTI = getAnalysis().getTTI(F); auto &ORE = getAnalysis().getORE(); - LowerMatrixIntrinsics LMT(F, TTI, ORE); + auto &AA = getAnalysis().getAAResults(); + auto &DT = getAnalysis().getDomTree(); + auto &LI = getAnalysis().getLoopInfo(); + LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE); bool C = LMT.Visit(); return C; } @@ -1442,7 +1680,11 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } }; } // namespace @@ -1452,6 +1694,9 @@ INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, false, false) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, false, false) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll @@ -0,0 +1,259 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -fuse-matrix-tile-size=2 -force-fuse-matrix -instcombine %s -S | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "aarch64-apple-ios" + +define void @multiply(<16 x double> * %A, <16 x double> * %B, <16 x double>* %C) { +; CHECK-LABEL: @multiply( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ST_B:%.*]] = ptrtoint <16 x double>* [[C:%.*]] to i64 +; CHECK-NEXT: [[ST_E:%.*]] = add nuw nsw i64 [[ST_B]], 128 +; CHECK-NEXT: [[LD_B:%.*]] = ptrtoint <16 x double>* [[A:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[ST_E]], [[LD_B]] +; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]] +; CHECK: alias_cont: +; CHECK-NEXT: [[LD_E:%.*]] = add nuw nsw i64 [[LD_B]], 128 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LD_E]], [[ST_B]] +; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]] +; CHECK: copy: +; CHECK-NEXT: [[TMP2:%.*]] = alloca <16 x double>, align 128 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x double>* [[TMP2]] to i8* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x double>* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP3]], i8* nonnull align 16 dereferenceable(128) [[TMP4]], i64 128, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS]] +; CHECK: no_alias: +; CHECK-NEXT: [[TMP5:%.*]] = phi <16 x double>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ] +; CHECK-NEXT: [[ST_B1:%.*]] = ptrtoint <16 x double>* [[C]] to i64 +; CHECK-NEXT: [[ST_E2:%.*]] = add nuw nsw i64 [[ST_B1]], 128 +; CHECK-NEXT: [[LD_B6:%.*]] = ptrtoint <16 x double>* [[B:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp 
ugt i64 [[ST_E2]], [[LD_B6]] +; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT3:%.*]], label [[NO_ALIAS5:%.*]] +; CHECK: alias_cont3: +; CHECK-NEXT: [[LD_E7:%.*]] = add nuw nsw i64 [[LD_B6]], 128 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LD_E7]], [[ST_B1]] +; CHECK-NEXT: br i1 [[TMP7]], label [[COPY4:%.*]], label [[NO_ALIAS5]] +; CHECK: copy4: +; CHECK-NEXT: [[TMP8:%.*]] = alloca <16 x double>, align 128 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x double>* [[TMP8]] to i8* +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x double>* [[B]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP9]], i8* nonnull align 16 dereferenceable(128) [[TMP10]], i64 128, i1 false) +; CHECK-NEXT: br label [[NO_ALIAS5]] +; CHECK: no_alias5: +; CHECK-NEXT: [[TMP11:%.*]] = phi <16 x double>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT3]] ], [ [[TMP8]], [[COPY4]] ] +; CHECK-NEXT: [[COL_CAST8:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST8]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST9]], align 8 +; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST12]], align 8 +; CHECK-NEXT: [[COL_GEP14:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST15:%.*]] = bitcast double* [[COL_GEP14]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST15]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x double> [[COL_LOAD10]], [[SPLAT_SPLAT19]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT22]] +; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[COL_LOAD10]], [[SPLAT_SPLAT25]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST27:%.*]] = bitcast double* [[TMP18]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD28:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST27]], align 8 +; CHECK-NEXT: [[COL_GEP29:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST30:%.*]] = bitcast double* [[COL_GEP29]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD31:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST30]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST33:%.*]] = bitcast double* [[TMP19]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD34:%.*]] = load <2 x double>, <2 x double>* 
[[COL_CAST33]], align 8 +; CHECK-NEXT: [[COL_GEP35:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST36:%.*]] = bitcast double* [[COL_GEP35]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST36]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[COL_LOAD28]], [[SPLAT_SPLAT41]] +; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[COL_LOAD31]], [[SPLAT_SPLAT44]] +; CHECK-NEXT: [[TMP23:%.*]] = fadd <2 x double> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[SPLAT_SPLAT48:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[COL_LOAD28]], [[SPLAT_SPLAT48]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP17]], [[TMP24]] +; CHECK-NEXT: [[SPLAT_SPLAT51:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[COL_LOAD31]], [[SPLAT_SPLAT51]] +; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[COL_CAST53:%.*]] = bitcast <16 x double>* [[C]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP23]], <2 x double>* [[COL_CAST53]], align 8 +; CHECK-NEXT: [[COL_GEP54:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST55:%.*]] = bitcast double* [[COL_GEP54]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[COL_CAST55]], align 8 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST57:%.*]] = bitcast double* [[TMP28]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD58:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST57]], align 8 +; CHECK-NEXT: [[COL_GEP59:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST60:%.*]] = bitcast double* [[COL_GEP59]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD61:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST60]], align 8 +; CHECK-NEXT: [[COL_CAST63:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD64:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST63]], align 8 +; CHECK-NEXT: [[COL_GEP65:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST66:%.*]] = bitcast double* [[COL_GEP65]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD67:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST66]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT70:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT70]] +; CHECK-NEXT: [[SPLAT_SPLAT73:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[COL_LOAD61]], [[SPLAT_SPLAT73]] +; CHECK-NEXT: [[TMP31:%.*]] = fadd <2 x double> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT76]] +; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] 
= shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = fmul <2 x double> [[COL_LOAD61]], [[SPLAT_SPLAT79]] +; CHECK-NEXT: [[TMP34:%.*]] = fadd <2 x double> [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST81:%.*]] = bitcast double* [[TMP35]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD82:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST81]], align 8 +; CHECK-NEXT: [[COL_GEP83:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST84:%.*]] = bitcast double* [[COL_GEP83]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD85:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST84]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST87:%.*]] = bitcast double* [[TMP36]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD88:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST87]], align 8 +; CHECK-NEXT: [[COL_GEP89:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST90:%.*]] = bitcast double* [[COL_GEP89]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD91:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST90]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x double> [[COL_LOAD82]], [[SPLAT_SPLAT95]] +; CHECK-NEXT: [[TMP38:%.*]] = fadd <2 x double> [[TMP31]], [[TMP37]] +; CHECK-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = fmul <2 x double> [[COL_LOAD85]], [[SPLAT_SPLAT98]] +; CHECK-NEXT: [[TMP40:%.*]] = fadd <2 x double> [[TMP38]], [[TMP39]] +; CHECK-NEXT: [[SPLAT_SPLAT102:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = fmul <2 x double> [[COL_LOAD82]], [[SPLAT_SPLAT102]] +; CHECK-NEXT: [[TMP42:%.*]] = fadd <2 x double> [[TMP34]], [[TMP41]] +; CHECK-NEXT: [[SPLAT_SPLAT105:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = fmul <2 x double> [[COL_LOAD85]], [[SPLAT_SPLAT105]] +; CHECK-NEXT: [[TMP44:%.*]] = fadd <2 x double> [[TMP42]], [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST107:%.*]] = bitcast double* [[TMP45]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP40]], <2 x double>* [[COL_CAST107]], align 8 +; CHECK-NEXT: [[COL_GEP108:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST109:%.*]] = bitcast double* [[COL_GEP108]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP44]], <2 x double>* [[COL_CAST109]], align 8 +; CHECK-NEXT: [[COL_CAST111:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD112:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST111]], align 8 +; CHECK-NEXT: [[COL_GEP113:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST114:%.*]] = bitcast double* [[COL_GEP113]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST114]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST117:%.*]] = 
bitcast double* [[TMP46]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD118:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST117]], align 8 +; CHECK-NEXT: [[COL_GEP119:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST120:%.*]] = bitcast double* [[COL_GEP119]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD121:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST120]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT124:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT124]] +; CHECK-NEXT: [[SPLAT_SPLAT127:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD115]], [[SPLAT_SPLAT127]] +; CHECK-NEXT: [[TMP49:%.*]] = fadd <2 x double> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT130]] +; CHECK-NEXT: [[SPLAT_SPLAT133:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = fmul <2 x double> [[COL_LOAD115]], [[SPLAT_SPLAT133]] +; CHECK-NEXT: [[TMP52:%.*]] = fadd <2 x double> [[TMP50]], [[TMP51]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST135:%.*]] = bitcast double* [[TMP53]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST135]], align 8 +; CHECK-NEXT: [[COL_GEP137:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST138:%.*]] = bitcast double* [[COL_GEP137]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD139:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST138]], align 8 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST141:%.*]] = bitcast double* [[TMP54]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD142:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST141]], align 8 +; CHECK-NEXT: [[COL_GEP143:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST144:%.*]] = bitcast double* [[COL_GEP143]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD145:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST144]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT149:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP55:%.*]] = fmul <2 x double> [[COL_LOAD136]], [[SPLAT_SPLAT149]] +; CHECK-NEXT: [[TMP56:%.*]] = fadd <2 x double> [[TMP49]], [[TMP55]] +; CHECK-NEXT: [[SPLAT_SPLAT152:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = fmul <2 x double> [[COL_LOAD139]], [[SPLAT_SPLAT152]] +; CHECK-NEXT: [[TMP58:%.*]] = fadd <2 x double> [[TMP56]], [[TMP57]] +; CHECK-NEXT: [[SPLAT_SPLAT156:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = fmul <2 x double> [[COL_LOAD136]], [[SPLAT_SPLAT156]] +; CHECK-NEXT: [[TMP60:%.*]] = fadd <2 x double> [[TMP52]], [[TMP59]] +; CHECK-NEXT: [[SPLAT_SPLAT159:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = fmul <2 x double> [[COL_LOAD139]], [[SPLAT_SPLAT159]] +; 
CHECK-NEXT: [[TMP62:%.*]] = fadd <2 x double> [[TMP60]], [[TMP61]] +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST161:%.*]] = bitcast double* [[TMP63]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP58]], <2 x double>* [[COL_CAST161]], align 8 +; CHECK-NEXT: [[COL_GEP162:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST163:%.*]] = bitcast double* [[COL_GEP162]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP62]], <2 x double>* [[COL_CAST163]], align 8 +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2 +; CHECK-NEXT: [[COL_CAST165:%.*]] = bitcast double* [[TMP64]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD166:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST165]], align 8 +; CHECK-NEXT: [[COL_GEP167:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4 +; CHECK-NEXT: [[COL_CAST168:%.*]] = bitcast double* [[COL_GEP167]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD169:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST168]], align 8 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8 +; CHECK-NEXT: [[COL_CAST171:%.*]] = bitcast double* [[TMP65]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD172:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST171]], align 8 +; CHECK-NEXT: [[COL_GEP173:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST174:%.*]] = bitcast double* [[COL_GEP173]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD175:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST174]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT178:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP66:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT178]] +; CHECK-NEXT: [[SPLAT_SPLAT181:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = fmul <2 x double> [[COL_LOAD169]], [[SPLAT_SPLAT181]] +; CHECK-NEXT: [[TMP68:%.*]] = fadd <2 x double> [[TMP66]], [[TMP67]] +; CHECK-NEXT: [[SPLAT_SPLAT184:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP69:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT184]] +; CHECK-NEXT: [[SPLAT_SPLAT187:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = fmul <2 x double> [[COL_LOAD169]], [[SPLAT_SPLAT187]] +; CHECK-NEXT: [[TMP71:%.*]] = fadd <2 x double> [[TMP69]], [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST189:%.*]] = bitcast double* [[TMP72]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD190:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST189]], align 8 +; CHECK-NEXT: [[COL_GEP191:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST192:%.*]] = bitcast double* [[COL_GEP191]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD193:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST192]], align 8 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST195:%.*]] = bitcast double* [[TMP73]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD196:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST195]], align 8 +; CHECK-NEXT: [[COL_GEP197:%.*]] = 
getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST198:%.*]] = bitcast double* [[COL_GEP197]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD199:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST198]], align 8 +; CHECK-NEXT: [[SPLAT_SPLAT203:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP74:%.*]] = fmul <2 x double> [[COL_LOAD190]], [[SPLAT_SPLAT203]] +; CHECK-NEXT: [[TMP75:%.*]] = fadd <2 x double> [[TMP68]], [[TMP74]] +; CHECK-NEXT: [[SPLAT_SPLAT206:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = fmul <2 x double> [[COL_LOAD193]], [[SPLAT_SPLAT206]] +; CHECK-NEXT: [[TMP77:%.*]] = fadd <2 x double> [[TMP75]], [[TMP76]] +; CHECK-NEXT: [[SPLAT_SPLAT210:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP78:%.*]] = fmul <2 x double> [[COL_LOAD190]], [[SPLAT_SPLAT210]] +; CHECK-NEXT: [[TMP79:%.*]] = fadd <2 x double> [[TMP71]], [[TMP78]] +; CHECK-NEXT: [[SPLAT_SPLAT213:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = fmul <2 x double> [[COL_LOAD193]], [[SPLAT_SPLAT213]] +; CHECK-NEXT: [[TMP81:%.*]] = fadd <2 x double> [[TMP79]], [[TMP80]] +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 10 +; CHECK-NEXT: [[COL_CAST215:%.*]] = bitcast double* [[TMP82]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP77]], <2 x double>* [[COL_CAST215]], align 8 +; CHECK-NEXT: [[COL_GEP216:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 12 +; CHECK-NEXT: [[COL_CAST217:%.*]] = bitcast double* [[COL_GEP216]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP81]], <2 x double>* [[COL_CAST217]], align 8 +; CHECK-NEXT: ret void +; +entry: + %a = load <16 x double>, <16 x double>* %A, align 16 + %b = load <16 x double>, <16 x double>* %B, align 16 + + %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4) + + store <16 x double> %c, <16 x double>* %C, align 16 + ret void +} + +declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
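+
+; The 4x4 double multiply above would not be considered profitable by this
+; patch's cost model on its own: with 128-bit NEON vectors VF = 128 / 64 = 2,
+; so the operands are estimated at ceil(4/2) * 4 + ceil(4/2) * 4 = 16 vector
+; registers, which does not exceed the 32 vector registers available on
+; AArch64. -force-fuse-matrix overrides that check, and -fuse-matrix-tile-size=2
+; produces the 2x2 tile loads and stores checked above.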