diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/MatrixUtils.h"
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -1176,6 +1177,57 @@
     return Res;
   }
 
+  void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
+                        Value *RPtr, ShapeInfo RShape, Value *ResPtr,
+                        bool AllowContract) {
+    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+    // Create the main tiling loop nest.
+    TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns, TileSize);
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+    Instruction *InsertI = cast<Instruction>(MatMul);
+    BasicBlock *Start = InsertI->getParent();
+    BasicBlock *End = SplitBlock(InsertI->getParent(), InsertI, &DT, nullptr,
+                                 nullptr, "continue");
+    IRBuilder<> Builder(MatMul);
+    BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, LI);
+
+    Type *TileVecTy =
+        VectorType::get(MatMul->getType()->getScalarType(), TileSize);
+
+    MatrixTy TileResult;
+    // Insert in the inner loop header.
+    Builder.SetInsertPoint(TI.InnerLoopHeader->getTerminator());
+    // Create PHI nodes for the result columns to accumulate across iterations.
+    SmallVector<PHINode *, 4> ColumnPhis;
+    for (unsigned I = 0; I < TileSize; I++) {
+      auto *Phi = Builder.CreatePHI(TileVecTy, 2);
+      Phi->addIncoming(ConstantAggregateZero::get(TileVecTy),
+                       TI.RowLoopHeader->getSingleSuccessor());
+      TileResult.addVector(Phi);
+      ColumnPhis.push_back(Phi);
+    }
+
+    // Insert in the inner loop body, which computes
+    //   Res += Load(CurrentRow, K) * Load(K, CurrentColumn)
+    Builder.SetInsertPoint(InnerBody->getTerminator());
+    // Load tiles of the operands.
+    MatrixTy A = loadMatrix(LPtr, LShape, TI.CurrentRow, TI.CurrentK,
+                            {TileSize, TileSize}, EltType, Builder);
+    MatrixTy B = loadMatrix(RPtr, RShape, TI.CurrentK, TI.CurrentCol,
+                            {TileSize, TileSize}, EltType, Builder);
+    emitMatrixMultiply(TileResult, A, B, AllowContract, Builder, true);
+    // Store the result tile after the inner loop is done.
+    Builder.SetInsertPoint(TI.RowLoopLatch->getTerminator());
+    storeMatrix(TileResult, ResPtr, {LShape.NumRows, RShape.NumColumns},
+                TI.CurrentRow, TI.CurrentCol, EltType, Builder);
+
+    for (unsigned I = 0; I < TileResult.getNumVectors(); I++)
+      ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.InnerLoopLatch);
+  }
+
   void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
                       StoreInst *Store,
                       SmallPtrSetImpl<Instruction *> &FusedInsts) {
@@ -1198,26 +1250,30 @@
     bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
                                                   MatMul->hasAllowContract());
-    IRBuilder<> Builder(Store);
-    for (unsigned J = 0; J < C; J += TileSize)
-      for (unsigned I = 0; I < R; I += TileSize) {
-        const unsigned TileR = std::min(R - I, unsigned(TileSize));
-        const unsigned TileC = std::min(C - J, unsigned(TileSize));
-        MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
-
-        for (unsigned K = 0; K < M; K += TileSize) {
-          const unsigned TileM = std::min(M - K, unsigned(TileSize));
-          MatrixTy A =
-              loadMatrix(APtr, LShape, Builder.getInt32(I), Builder.getInt32(K),
-                         {TileR, TileM}, EltType, Builder);
-          MatrixTy B =
-              loadMatrix(BPtr, RShape, Builder.getInt32(K), Builder.getInt32(J),
-                         {TileM, TileC}, EltType, Builder);
-          emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+    if (R % TileSize == 0 && C % TileSize == 0) {
+      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, CPtr, AllowContract);
+    } else {
+      IRBuilder<> Builder(Store);
+      for (unsigned J = 0; J < C; J += TileSize)
+        for (unsigned I = 0; I < R; I += TileSize) {
+          const unsigned TileR = std::min(R - I, unsigned(TileSize));
+          const unsigned TileC = std::min(C - J, unsigned(TileSize));
+          MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
+
+          for (unsigned K = 0; K < M; K += TileSize) {
+            const unsigned TileM = std::min(M - K, unsigned(TileSize));
+            MatrixTy A = loadMatrix(APtr, LShape, Builder.getInt32(I),
+                                    Builder.getInt32(K), {TileR, TileM},
+                                    EltType, Builder);
+            MatrixTy B = loadMatrix(BPtr, RShape, Builder.getInt32(K),
+                                    Builder.getInt32(J), {TileM, TileC},
+                                    EltType, Builder);
+            emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+          }
+          storeMatrix(Res, CPtr, {R, M}, Builder.getInt32(I),
+                      Builder.getInt32(J), EltType, Builder);
        }
-        storeMatrix(Res, CPtr, {R, M}, Builder.getInt32(I), Builder.getInt32(J),
-                    EltType, Builder);
-      }
+    }
 
     // Mark eliminated instructions as fused and remove them.
     FusedInsts.insert(Store);
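
Reviewer note: the nest that createTiledLoops builds (the cols.header/rows.header/inner.header blocks plus their body and latch blocks) is easiest to follow as its scalar equivalent. The sketch below is illustrative only and not part of the patch; tiledMultiplyModel is a made-up name, the operands are column-major, and R, C and M are assumed to be exact multiples of the tile size, which is the condition guarding this path in emitSIMDTiling. The Acc array plays the role of the column PHIs created in the inner loop header.

    // Scalar model of the emitted cols/rows/inner loop nest (hypothetical).
    // APtr is R x M, BPtr is M x C, CPtr is R x C, all column-major.
    void tiledMultiplyModel(const double *APtr, const double *BPtr,
                            double *CPtr, unsigned R, unsigned C, unsigned M) {
      constexpr unsigned TileSize = 2; // matches -fuse-matrix-tile-size=2
      for (unsigned Col = 0; Col < C; Col += TileSize)     // cols loop
        for (unsigned Row = 0; Row < R; Row += TileSize) { // rows loop
          double Acc[TileSize][TileSize] = {}; // the zero-initialized PHIs
          for (unsigned K = 0; K < M; K += TileSize)       // inner loop
            // Res += Load(CurrentRow, K) * Load(K, CurrentColumn)
            for (unsigned J = 0; J < TileSize; ++J)
              for (unsigned I = 0; I < TileSize; ++I)
                for (unsigned T = 0; T < TileSize; ++T)
                  Acc[J][I] += APtr[(K + T) * R + (Row + I)] *
                               BPtr[(Col + J) * M + (K + T)];
          // rows.latch: write the finished tile back to the result.
          for (unsigned J = 0; J < TileSize; ++J)
            for (unsigned I = 0; I < TileSize; ++I)
              CPtr[(Col + J) * R + (Row + I)] = Acc[J][I];
        }
    }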
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -0,0 +1,402 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -instcombine -verify-dom-info %s -S | FileCheck %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @multiply_noalias_4x4(<16 x double>* noalias %A, <16 x double>* noalias %B, <16 x double>* noalias %C) {
+; CHECK-LABEL: @multiply_noalias_4x4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[COLS_HEADER:%.*]]
+; CHECK: cols.header:
+; CHECK-NEXT: [[COLS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[COLS_BODY:%.*]]
+; CHECK: cols.body:
+; CHECK-NEXT: br label [[ROWS_HEADER:%.*]]
+; CHECK: rows.header:
+; CHECK-NEXT: [[ROWS_IV:%.*]] = phi i32 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[ROWS_BODY:%.*]]
+; CHECK: rows.body:
+; CHECK-NEXT: br label [[INNER_HEADER:%.*]]
+; CHECK: inner.header:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP13:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: br label [[INNER_BODY:%.*]]
+; CHECK: inner.body:
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INNER_IV]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <16 x double>, <16 x double>* [[A:%.*]], i64 0, i64 [[TMP4]]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP5]], i64 4
+; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[INNER_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <16 x double>, <16 x double>* [[B:%.*]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[TMP9]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
+; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP9]], i64 4
+; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
+; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD5]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[TMP0]])
+; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[COL_LOAD5]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP11]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP10]])
+; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD8]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP1]])
+; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD8]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP13]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP12]])
+; CHECK-NEXT: br label [[INNER_LATCH]]
+; CHECK: inner.latch:
+; CHECK-NEXT: [[INNER_STEP]] = add i32 [[INNER_IV]], 2
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp eq i32 [[INNER_STEP]], 4
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[ROWS_LATCH]], label [[INNER_HEADER]]
+; CHECK: rows.latch:
+; CHECK-NEXT: [[ROWS_STEP]] = add i32 [[ROWS_IV]], 2
+; CHECK-NEXT: [[ROWS_COND:%.*]] = icmp eq i32 [[ROWS_STEP]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = shl i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr <16 x double>, <16 x double>* [[C:%.*]], i64 0, i64 [[TMP16]]
+; CHECK-NEXT: [[VEC_CAST21:%.*]] = bitcast double* [[TMP17]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[VEC_CAST21]], align 8
+; CHECK-NEXT: [[VEC_GEP22:%.*]] = getelementptr double, double* [[TMP17]], i64 4
+; CHECK-NEXT: [[VEC_CAST23:%.*]] = bitcast double* [[VEC_GEP22]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[VEC_CAST23]], align 8
+; CHECK-NEXT: br i1 [[ROWS_COND]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
+; CHECK: cols.latch:
+; CHECK-NEXT: [[COLS_STEP]] = add i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[COLS_COND:%.*]] = icmp eq i32 [[COLS_STEP]], 4
+; CHECK-NEXT: br i1 [[COLS_COND]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
+; CHECK: continue:
+; CHECK-NEXT: ret void
+;
+
+entry:
+  %a = load <16 x double>, <16 x double>* %A, align 16
+  %b = load <16 x double>, <16 x double>* %B, align 16
+
+  %c = call <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
+
+  store <16 x double> %c, <16 x double>* %C, align 16
+  ret void
+}
+
+
+declare <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double>, <16 x double>, i32, i32, i32)
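
The address arithmetic in the inner.body checks is the same in all of these tests: the matrices are stored column-major, so element (Row, Col) of a matrix with column stride Stride lives at index Col * Stride + Row, and instcombine strength-reduces the multiply to a shift because the stride is a power of two. A small illustration, with a hypothetical helper name that does not appear in the patch:

    // Mirrors the TMP2..TMP5 sequence in @multiply_noalias_4x4: A has column
    // stride 4, so `shl i32 [[INNER_IV]], 2` computes Col * 4 and the add
    // appends the row; the getelementptr then indexes into the flat vector.
    double *colMajorElemAddr(double *Base, long Row, long Col, long Stride) {
      long Idx = Col * Stride + Row; // shl + add + sext in the checks
      return Base + Idx;             // getelementptr ..., i64 0, i64 Idx
    }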
+
+define void @multiply_noalias_2x4(<8 x i32>* noalias %A, <8 x i32>* noalias %B, <4 x i32>* noalias %C) {
+; CHECK-LABEL: @multiply_noalias_2x4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[COLS_HEADER:%.*]]
+; CHECK: cols.header:
+; CHECK-NEXT: [[COLS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[COLS_BODY:%.*]]
+; CHECK: cols.body:
+; CHECK-NEXT: br label [[ROWS_HEADER:%.*]]
+; CHECK: rows.header:
+; CHECK-NEXT: [[ROWS_IV:%.*]] = phi i32 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[ROWS_BODY:%.*]]
+; CHECK: rows.body:
+; CHECK-NEXT: br label [[INNER_HEADER:%.*]]
+
+; In the inner loop, compute
+;   Result += Load(A, ROWS_IV, INNER_IV) * Load(B, INNER_IV, COLS_IV)
+
+; CHECK: inner.header:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP13:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: br label [[INNER_BODY:%.*]]
+; CHECK: inner.body:
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <8 x i32>, <8 x i32>* [[A:%.*]], i64 0, i64 [[TMP4]]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST]], align 4
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP5]], i64 2
+; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast i32* [[VEC_GEP]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST1]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[INNER_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <8 x i32>, <8 x i32>* [[B:%.*]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST4]], align 4
+; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[TMP9]], i64 4
+; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast i32* [[VEC_GEP6]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST7]], align 4
+; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x i32> [[COL_LOAD5]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP0]], [[TMP10]]
+; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x i32> [[COL_LOAD5]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP12:%.*]] = mul <2 x i32> [[COL_LOAD2]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT: [[TMP13]] = add <2 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x i32> [[COL_LOAD8]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[COL_LOAD]], [[SPLAT_SPLAT16]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP1]], [[TMP14]]
+; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x i32> [[COL_LOAD8]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[COL_LOAD2]], [[SPLAT_SPLAT19]]
+; CHECK-NEXT: [[TMP17]] = add <2 x i32> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: br label [[INNER_LATCH]]
+; CHECK: inner.latch:
+; CHECK-NEXT: [[INNER_STEP]] = add i32 [[INNER_IV]], 2
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp eq i32 [[INNER_STEP]], 4
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[ROWS_LATCH]], label [[INNER_HEADER]]
+
+; Store the current 2x2 tile.
+
+; CHECK: rows.latch:
+; CHECK-NEXT: [[ROWS_STEP]] = add i32 [[ROWS_IV]], 2
+; CHECK-NEXT: [[ROWS_COND:%.*]] = icmp eq i32 [[ROWS_IV]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = shl i32 [[COLS_IV]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr <4 x i32>, <4 x i32>* [[C:%.*]], i64 0, i64 [[TMP20]]
+; CHECK-NEXT: [[VEC_CAST21:%.*]] = bitcast i32* [[TMP21]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP13]], <2 x i32>* [[VEC_CAST21]], align 4
+; CHECK-NEXT: [[VEC_GEP22:%.*]] = getelementptr i32, i32* [[TMP21]], i64 2
+; CHECK-NEXT: [[VEC_CAST23:%.*]] = bitcast i32* [[VEC_GEP22]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[VEC_CAST23]], align 4
+; CHECK-NEXT: br i1 [[ROWS_COND]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
+; CHECK: cols.latch:
+; CHECK-NEXT: [[COLS_STEP]] = add i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[COLS_COND:%.*]] = icmp eq i32 [[COLS_IV]], 0
+; CHECK-NEXT: br i1 [[COLS_COND]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
+; CHECK: continue:
+; CHECK-NEXT: ret void
+;
+entry:
+  %a = load <8 x i32>, <8 x i32>* %A, align 16
+  %b = load <8 x i32>, <8 x i32>* %B, align 16
+
+  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v8i32.v8i32(<8 x i32> %a, <8 x i32> %b, i32 2, i32 4, i32 2)
+
+  store <4 x i32> %c, <4 x i32>* %C, align 16
+  ret void
+}
+
+
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v8i32.v8i32(<8 x i32>, <8 x i32>, i32, i32, i32)
+
+define void @multiply_noalias_4x2_2x8(<8 x i32>* noalias %A, <16 x i32>* noalias %B, <32 x i32>* noalias %C) {
+; CHECK-LABEL: @multiply_noalias_4x2_2x8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[COLS_HEADER:%.*]]
+; CHECK: cols.header:
+; CHECK-NEXT: [[COLS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[COLS_BODY:%.*]]
+; CHECK: cols.body:
+; CHECK-NEXT: br label [[ROWS_HEADER:%.*]]
+; CHECK: rows.header:
+; CHECK-NEXT: [[ROWS_IV:%.*]] = phi i32 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[ROWS_BODY:%.*]]
+; CHECK: rows.body:
+; CHECK-NEXT: br label [[INNER_HEADER:%.*]]
+
+; In the inner loop, compute
+;   Result += Load(A, ROWS_IV, INNER_IV) * Load(B, INNER_IV, COLS_IV)
+
+; CHECK: inner.header:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP13:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: br label [[INNER_BODY:%.*]]
+; CHECK: inner.body:
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INNER_IV]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <8 x i32>, <8 x i32>* [[A:%.*]], i64 0, i64 [[TMP4]]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST]], align 4
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP5]], i64 4
+; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast i32* [[VEC_GEP]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST1]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[COLS_IV]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[INNER_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <16 x i32>, <16 x i32>* [[B:%.*]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST4]], align 4
+; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[TMP9]], i64 2
+; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast i32* [[VEC_GEP6]] to <2 x i32>*
+; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[VEC_CAST7]], align 4
+; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x i32> [[COL_LOAD5]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP0]], [[TMP10]]
+; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x i32> [[COL_LOAD5]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP12:%.*]] = mul <2 x i32> [[COL_LOAD2]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT: [[TMP13]] = add <2 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x i32> [[COL_LOAD8]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[COL_LOAD]], [[SPLAT_SPLAT16]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP1]], [[TMP14]]
+; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x i32> [[COL_LOAD8]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[COL_LOAD2]], [[SPLAT_SPLAT19]]
+; CHECK-NEXT: [[TMP17]] = add <2 x i32> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: br label [[INNER_LATCH]]
+; CHECK: inner.latch:
+; CHECK-NEXT: [[INNER_STEP]] = add i32 [[INNER_IV]], 2
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp eq i32 [[INNER_IV]], 0
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[ROWS_LATCH]], label [[INNER_HEADER]]
+
+; Store the current 2x2 tile.
+
+; CHECK: rows.latch:
+; CHECK-NEXT: [[ROWS_STEP]] = add i32 [[ROWS_IV]], 2
+; CHECK-NEXT: [[ROWS_COND:%.*]] = icmp eq i32 [[ROWS_STEP]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = shl i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr <32 x i32>, <32 x i32>* [[C:%.*]], i64 0, i64 [[TMP20]]
+; CHECK-NEXT: [[VEC_CAST21:%.*]] = bitcast i32* [[TMP21]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP13]], <2 x i32>* [[VEC_CAST21]], align 4
+; CHECK-NEXT: [[VEC_GEP22:%.*]] = getelementptr i32, i32* [[TMP21]], i64 4
+; CHECK-NEXT: [[VEC_CAST23:%.*]] = bitcast i32* [[VEC_GEP22]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[VEC_CAST23]], align 4
+; CHECK-NEXT: br i1 [[ROWS_COND]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
+; CHECK: cols.latch:
+; CHECK-NEXT: [[COLS_STEP]] = add i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[COLS_COND:%.*]] = icmp eq i32 [[COLS_STEP]], 8
+; CHECK-NEXT: br i1 [[COLS_COND]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
+; CHECK: continue:
+; CHECK-NEXT: ret void
+;
+entry:
+  %a = load <8 x i32>, <8 x i32>* %A, align 16
+  %b = load <16 x i32>, <16 x i32>* %B, align 16
+
+  %c = call <32 x i32> @llvm.matrix.multiply.v32i32.v8i32.v16i32(<8 x i32> %a, <16 x i32> %b, i32 4, i32 2, i32 8)
+
+  store <32 x i32> %c, <32 x i32>* %C, align 16
+  ret void
+}
+
+declare <32 x i32> @llvm.matrix.multiply.v32i32.v8i32.v16i32(<8 x i32>, <16 x i32>, i32, i32, i32)
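
The next test drops the noalias annotations, so the lowering must establish at runtime that reading the operands and writing %C cannot interfere: it compares the store range against each load range, and an overlapping operand is copied into a fresh alloca, with a phi selecting between the original pointer and the copy. A sketch of that decision in C++, illustrative only (getNonAliasingPointer and its parameters are invented for this note; for simplicity it assumes load and store ranges of equal size, while the emitted checks use each range's own size):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Returns a pointer whose Bytes-sized range does not overlap the store
    // range, mirroring the ptrtoint/icmp/alloca/memcpy pattern in the checks:
    // the ranges overlap iff StoreEnd > LoadBegin and LoadEnd > StoreBegin.
    const double *getNonAliasingPointer(const double *Load, const double *Store,
                                        std::size_t Bytes, double *Tmp) {
      auto LoadBegin = reinterpret_cast<std::uintptr_t>(Load);
      auto StoreBegin = reinterpret_cast<std::uintptr_t>(Store);
      if (StoreBegin + Bytes > LoadBegin && LoadBegin + Bytes > StoreBegin) {
        std::memcpy(Tmp, Load, Bytes); // the copy block
        return Tmp;                    // phi takes the alloca
      }
      return Load;                     // phi takes the original pointer
    }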
+
+
+; Test the runtime aliasing checks.
+define void @multiply_alias_2x2(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) {
+; CHECK-LABEL: @multiply_alias_2x2(
+
+; First, check for aliasing at runtime and create non-aliasing copies if required.
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint <4 x float>* [[C:%.*]] to i64
+; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 16
+; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <4 x float>* [[A:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; CHECK: alias_cont:
+; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; CHECK: copy:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float>* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float>* [[A]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) [[TMP3]], i8* nonnull align 16 dereferenceable(16) [[TMP4]], i64 16, i1 false)
+; CHECK-NEXT: br label [[NO_ALIAS]]
+; CHECK: no_alias:
+; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x float>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; CHECK-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint <4 x float>* [[C]] to i64
+; CHECK-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 16
+; CHECK-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint <4 x float>* [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; CHECK: alias_cont1:
+; CHECK-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; CHECK: copy2:
+; CHECK-NEXT: [[TMP8:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x float>* [[TMP8]] to i8*
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x float>* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) [[TMP9]], i8* nonnull align 16 dereferenceable(16) [[TMP10]], i64 16, i1 false)
+; CHECK-NEXT: br label [[NO_ALIAS3]]
+; CHECK: no_alias3:
+; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x float>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT1]] ], [ [[TMP8]], [[COPY2]] ]
+; CHECK-NEXT: br label [[COLS_HEADER:%.*]]
+; CHECK: cols.header:
+; CHECK-NEXT: [[COLS_IV:%.*]] = phi i32 [ 0, [[NO_ALIAS3]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[COLS_BODY:%.*]]
+; CHECK: cols.body:
+; CHECK-NEXT: br label [[ROWS_HEADER:%.*]]
+; CHECK: rows.header:
+; CHECK-NEXT: [[ROWS_IV:%.*]] = phi i32 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[ROWS_BODY:%.*]]
+; CHECK: rows.body:
+; CHECK-NEXT: br label [[INNER_HEADER:%.*]]
+; CHECK: inner.header:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x float> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP23:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x float> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP25:%.*]], [[INNER_LATCH]] ]
+; CHECK-NEXT: br label [[INNER_BODY:%.*]]
+; CHECK: inner.body:
+; CHECK-NEXT: [[TMP14:%.*]] = shl i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr <4 x float>, <4 x float>* [[TMP5]], i64 0, i64 [[TMP16]]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[TMP17]] to <2 x float>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST]], align 4
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP17]], i64 2
+; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast float* [[VEC_GEP]] to <2 x float>*
+; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST8]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = shl i32 [[COLS_IV]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[INNER_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr <4 x float>, <4 x float>* [[TMP11]], i64 0, i64 [[TMP20]]
+; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast float* [[TMP21]] to <2 x float>*
+; CHECK-NEXT: [[COL_LOAD12:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST11]], align 4
+; CHECK-NEXT: [[VEC_GEP13:%.*]] = getelementptr float, float* [[TMP21]], i64 2
+; CHECK-NEXT: [[VEC_CAST14:%.*]] = bitcast float* [[VEC_GEP13]] to <2 x float>*
+; CHECK-NEXT: [[COL_LOAD15:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST14]], align 4
+; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[COL_LOAD12]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT]], <2 x float> [[TMP12]])
+; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x float> [[COL_LOAD12]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP23]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT19]], <2 x float> [[TMP22]])
+; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x float> [[COL_LOAD15]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT23]], <2 x float> [[TMP13]])
+; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <2 x float> [[COL_LOAD15]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP25]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT26]], <2 x float> [[TMP24]])
+; CHECK-NEXT: br label [[INNER_LATCH]]
+; CHECK: inner.latch:
+; CHECK-NEXT: [[INNER_STEP]] = add i32 [[INNER_IV]], 2
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp eq i32 [[INNER_IV]], 0
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[ROWS_LATCH]], label [[INNER_HEADER]]
+; CHECK: rows.latch:
+; CHECK-NEXT: [[ROWS_STEP]] = add i32 [[ROWS_IV]], 2
+; CHECK-NEXT: [[ROWS_COND:%.*]] = icmp eq i32 [[ROWS_IV]], 0
+; CHECK-NEXT: [[TMP26:%.*]] = shl i32 [[COLS_IV]], 1
+; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr <4 x float>, <4 x float>* [[C]], i64 0, i64 [[TMP28]]
+; CHECK-NEXT: [[VEC_CAST28:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP23]], <2 x float>* [[VEC_CAST28]], align 4
+; CHECK-NEXT: [[VEC_GEP29:%.*]] = getelementptr float, float* [[TMP29]], i64 2
+; CHECK-NEXT: [[VEC_CAST30:%.*]] = bitcast float* [[VEC_GEP29]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP25]], <2 x float>* [[VEC_CAST30]], align 4
+; CHECK-NEXT: br i1 [[ROWS_COND]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
+; CHECK: cols.latch:
+; CHECK-NEXT: [[COLS_STEP]] = add i32 [[COLS_IV]], 2
+; CHECK-NEXT: [[COLS_COND:%.*]] = icmp eq i32 [[COLS_IV]], 0
+; CHECK-NEXT: br i1 [[COLS_COND]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
+; CHECK: continue:
+; CHECK-NEXT: ret void
+;
+entry:
+  %a = load <4 x float>, <4 x float>* %A, align 16
+  %b = load <4 x float>, <4 x float>* %B, align 16
+
+  %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
+
+  store <4 x float> %c, <4 x float>* %C, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
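
The changes to multiply-fused.ll below switch the fixture from a 4x4 * 4x4 multiply to 5x3 * 3x2 (operands <15 x double>, <6 x double> and result <10 x double>). Neither the row count 5 nor the inner dimension 3 is a multiple of the 2-wide tile, so this test now stays on the unrolled emitSIMDTiling path with std::min-clamped edge tiles rather than the new loop nest. The snippet below is illustrative only and simply enumerates the partial products this produces; its output corresponds to the ; c[...] comments annotating the updated checks:

    #include <algorithm>
    #include <cstdio>

    // Tiles visited for R = 5, M = 3, C = 2 with -fuse-matrix-tile-size=2.
    int main() {
      const unsigned R = 5, M = 3, C = 2, TileSize = 2;
      for (unsigned J = 0; J < C; J += TileSize)
        for (unsigned I = 0; I < R; I += TileSize)
          for (unsigned K = 0; K < M; K += TileSize)
            std::printf("c[%u:%u][%u:%u] += a[%u:%u][%u:%u] * b[%u:%u][%u:%u]\n",
                        I, std::min(I + TileSize, R), J, std::min(J + TileSize, C),
                        I, std::min(I + TileSize, R), K, std::min(K + TileSize, M),
                        K, std::min(K + TileSize, M), J, std::min(J + TileSize, C));
      return 0;
    }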
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -lower-matrix-intrinsics -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -instcombine -verify-dom-info %s -S | FileCheck %s
 
 ; REQUIRES: aarch64-registered-target
@@ -5,269 +6,173 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "aarch64-apple-ios"
 
-define void @multiply(<16 x double> * %A, <16 x double> * %B, <16 x double>* %C) {
+define void @multiply(<15 x double> * %A, <6 x double> * %B, <10 x double>* %C) {
 ; CHECK-LABEL: @multiply(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ST_B:%.*]] = ptrtoint <16 x double>* [[C:%.*]] to i64
-; CHECK-NEXT: [[ST_E:%.*]] = add nuw nsw i64 [[ST_B]], 128
-; CHECK-NEXT: [[LD_B:%.*]] = ptrtoint <16 x double>* [[A:%.*]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[ST_E]], [[LD_B]]
+; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint <10 x double>* [[C:%.*]] to i64
+; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 80
+; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <15 x double>* [[A:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
 ; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
 ; CHECK: alias_cont:
-; CHECK-NEXT: [[LD_E:%.*]] = add nuw nsw i64 [[LD_B]], 128
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LD_E]], [[ST_B]]
+; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 120
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
 ; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
 ; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca <16 x double>, align 128
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x double>* [[TMP2]] to i8*
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x double>* [[A]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP3]], i8* nonnull align 16 dereferenceable(128) [[TMP4]], i64 128, i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <15 x double>, align 128
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <15 x double>* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <15 x double>* [[A]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(120) [[TMP3]], i8* nonnull align 16 dereferenceable(120) [[TMP4]], i64 120, i1 false)
 ; CHECK-NEXT: br label [[NO_ALIAS]]
 ; CHECK: no_alias:
-; CHECK-NEXT: [[TMP5:%.*]] = phi <16 x double>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
-; CHECK-NEXT: [[ST_B1:%.*]] = ptrtoint <16 x double>* [[C]] to i64
-; CHECK-NEXT: [[ST_E2:%.*]] = add nuw nsw i64 [[ST_B1]], 128
-; CHECK-NEXT: [[LD_B6:%.*]] = ptrtoint <16 x double>* [[B:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[ST_E2]], [[LD_B6]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT3:%.*]], label [[NO_ALIAS5:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <15 x double>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; CHECK-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint <10 x double>* [[C]] to i64
+; CHECK-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 80
+; CHECK-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint <6 x double>* [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
 ; CHECK: alias_cont1:
-; CHECK-NEXT: [[LD_E7:%.*]] = add nuw nsw i64 [[LD_B6]], 128
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LD_E7]], [[ST_B1]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[COPY4:%.*]], label [[NO_ALIAS5]]
+; CHECK-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 48
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
 ; CHECK: copy2:
-; CHECK-NEXT: [[TMP8:%.*]] = alloca <16 x double>, align 128
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x double>* [[TMP8]] to i8*
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x double>* [[B]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP9]], i8* nonnull align 16 dereferenceable(128) [[TMP10]], i64 128, i1 false)
-; CHECK-NEXT: br label [[NO_ALIAS5]]
-
+; CHECK-NEXT: [[TMP8:%.*]] = alloca <6 x double>, align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <6 x double>* [[TMP8]] to i8*
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <6 x double>* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP9]], i8* nonnull align 16 dereferenceable(48) [[TMP10]], i64 48, i1 false)
+; CHECK-NEXT: br label [[NO_ALIAS3]]
 ; CHECK: no_alias3:
-; CHECK-NEXT: [[TMP11:%.*]] = phi <16 x double>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT3]] ], [ [[TMP8]], [[COPY4]] ]
-
-;; np.dot(a[0:2, 0:2], b[0:2, 0:2])
-
-; CHECK-NEXT: [[COL_CAST8:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST8]], align 8
-; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4
-; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST9]], align 8
-; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST12]], align 8
-; CHECK-NEXT: [[COL_GEP14:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4
-; CHECK-NEXT: [[COL_CAST15:%.*]] = bitcast double* [[COL_GEP14]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST15]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = phi <6 x double>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT1]] ], [ [[TMP8]], [[COPY2]] ]
+
+; c[0:2][0:2] = a[0:2][0:2] * b[0:2][0:2] + a[0:2][2] * b[2][0:2]
+
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <15 x double>* [[TMP5]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 5
+; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST8]], align 8
+; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast <6 x double>* [[TMP11]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD12:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST11]], align 8
+; CHECK-NEXT: [[VEC_GEP13:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST14:%.*]] = bitcast double* [[VEC_GEP13]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD15:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST14]], align 8
+; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD12]], <2 x double> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP12]])
-; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT22]]
-; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT25]], <2 x double> [[TMP14]])
-
-;; + np.dot(a[0:2, 2:4], b[2:4, 0:2])
-
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8
-; CHECK-NEXT: [[COL_CAST27:%.*]] = bitcast double* [[TMP16]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD28:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST27]], align 8
-; CHECK-NEXT: [[COL_GEP29:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12
-; CHECK-NEXT: [[COL_CAST30:%.*]] = bitcast double* [[COL_GEP29]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD31:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST30]], align 8
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2
-; CHECK-NEXT: [[COL_CAST33:%.*]] = bitcast double* [[TMP17]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD34:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST33]], align 8
-; CHECK-NEXT: [[COL_GEP35:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 6
-; CHECK-NEXT: [[COL_CAST36:%.*]] = bitcast double* [[COL_GEP35]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST36]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD28]], <2 x double> [[SPLAT_SPLAT41]], <2 x double> [[TMP13]])
-; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD31]], <2 x double> [[SPLAT_SPLAT44]], <2 x double> [[TMP18]])
-; CHECK-NEXT: [[SPLAT_SPLAT48:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD28]], <2 x double> [[SPLAT_SPLAT48]], <2 x double> [[TMP15]])
-; CHECK-NEXT: [[SPLAT_SPLAT51:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD31]], <2 x double> [[SPLAT_SPLAT51]], <2 x double> [[TMP20]])
-
-;; -> c[0:2, 0:2]
-
-; CHECK-NEXT: [[COL_CAST53:%.*]] = bitcast <16 x double>* [[C]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[COL_CAST53]], align 8
-; CHECK-NEXT: [[COL_GEP54:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 4
-; CHECK-NEXT: [[COL_CAST55:%.*]] = bitcast double* [[COL_GEP54]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[COL_CAST55]], align 8
-
-;; np.dot(a[2:4, 0:2], b[0:2, 0:2])
-
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2
-; CHECK-NEXT: [[COL_CAST57:%.*]] = bitcast double* [[TMP22]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD58:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST57]], align 8
-; CHECK-NEXT: [[COL_GEP59:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 6
-; CHECK-NEXT: [[COL_CAST60:%.*]] = bitcast double* [[COL_GEP59]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD61:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST60]], align 8
-; CHECK-NEXT: [[COL_CAST63:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD64:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST63]], align 8
-; CHECK-NEXT: [[COL_GEP65:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4
-; CHECK-NEXT: [[COL_CAST66:%.*]] = bitcast double* [[COL_GEP65]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD67:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST66]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT70:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT70]]
-; CHECK-NEXT: [[SPLAT_SPLAT73:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP24:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD61]], <2 x double> [[SPLAT_SPLAT73]], <2 x double> [[TMP23]])
-; CHECK-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT76]]
-; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP26:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD61]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP25]])
-
-;; + np.dot(a[2:4, 2:4], b[2:4, 0:2])
-
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10
-; CHECK-NEXT: [[COL_CAST81:%.*]] = bitcast double* [[TMP27]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD82:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST81]], align 8
-; CHECK-NEXT: [[COL_GEP83:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 14
-; CHECK-NEXT: [[COL_CAST84:%.*]] = bitcast double* [[COL_GEP83]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD85:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST84]], align 8
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2
-; CHECK-NEXT: [[COL_CAST87:%.*]] = bitcast double* [[TMP28]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD88:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST87]], align 8
-; CHECK-NEXT: [[COL_GEP89:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 6
-; CHECK-NEXT: [[COL_CAST90:%.*]] = bitcast double* [[COL_GEP89]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD91:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST90]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD82]], <2 x double> [[SPLAT_SPLAT95]], <2 x double> [[TMP24]])
-; CHECK-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP30:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD85]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP29]])
-; CHECK-NEXT: [[SPLAT_SPLAT102:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP31:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD82]], <2 x double> [[SPLAT_SPLAT102]], <2 x double> [[TMP26]])
-; CHECK-NEXT: [[SPLAT_SPLAT105:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP32:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD85]], <2 x double> [[SPLAT_SPLAT105]], <2 x double> [[TMP31]])
-
-;; -> c[2:4, 0:2]
-
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 2
-; CHECK-NEXT: [[COL_CAST107:%.*]] = bitcast double* [[TMP33]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP30]], <2 x double>* [[COL_CAST107]], align 8
-; CHECK-NEXT: [[COL_GEP108:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 6
-; CHECK-NEXT: [[COL_CAST109:%.*]] = bitcast double* [[COL_GEP108]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP32]], <2 x double>* [[COL_CAST109]], align 8
-
-;; np.dot(a[0:2, 0:2], b[0:2, 2:4])
-
-; CHECK-NEXT: [[COL_CAST111:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD112:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST111]], align 8
-; CHECK-NEXT: [[COL_GEP113:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4
-; CHECK-NEXT: [[COL_CAST114:%.*]] = bitcast double* [[COL_GEP113]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST114]], align 8
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8
-; CHECK-NEXT: [[COL_CAST117:%.*]] = bitcast double* [[TMP34]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD118:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST117]], align 8
-; CHECK-NEXT: [[COL_GEP119:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12
-; CHECK-NEXT: [[COL_CAST120:%.*]] = bitcast double* [[COL_GEP119]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD121:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST120]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT124:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT124]]
-; CHECK-NEXT: [[SPLAT_SPLAT127:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP36:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT127]], <2 x double> [[TMP35]])
-; CHECK-NEXT: [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT130]]
-; CHECK-NEXT: [[SPLAT_SPLAT133:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP38:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT133]], <2 x double> [[TMP37]])
-
-;; + np.dot(a[0:2, 2:4], b[2:4, 2:4])
-
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8
-; CHECK-NEXT: [[COL_CAST135:%.*]] = bitcast double* [[TMP39]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST135]], align 8
-; CHECK-NEXT: [[COL_GEP137:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12
-; CHECK-NEXT: [[COL_CAST138:%.*]] = bitcast double* [[COL_GEP137]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD139:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST138]], align 8
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10
-; CHECK-NEXT: [[COL_CAST141:%.*]] = bitcast double* [[TMP40]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD142:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST141]], align 8
-; CHECK-NEXT: [[COL_GEP143:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 14
-; CHECK-NEXT: [[COL_CAST144:%.*]] = bitcast double* [[COL_GEP143]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD145:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST144]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT149:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP41:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD136]], <2 x double> [[SPLAT_SPLAT149]], <2 x double> [[TMP36]])
-; CHECK-NEXT: [[SPLAT_SPLAT152:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP42:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD139]], <2 x double> [[SPLAT_SPLAT152]], <2 x double> [[TMP41]])
-; CHECK-NEXT: [[SPLAT_SPLAT156:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP43:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD136]], <2 x double> [[SPLAT_SPLAT156]], <2 x double> [[TMP38]])
-; CHECK-NEXT: [[SPLAT_SPLAT159:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP44:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD139]], <2 x double> [[SPLAT_SPLAT159]], <2 x double> [[TMP43]])
-
-;; -> c[0:2, 2:4]
-
-; CHECK-NEXT: [[TMP45:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 8
-; CHECK-NEXT: [[COL_CAST161:%.*]] = bitcast double* [[TMP45]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP42]], <2 x double>* [[COL_CAST161]], align 8
-; CHECK-NEXT: [[COL_GEP162:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 12
-; CHECK-NEXT: [[COL_CAST163:%.*]] = bitcast double* [[COL_GEP162]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP44]], <2 x double>* [[COL_CAST163]], align 8
-
-;; np.dot(a[2:4, 0:2], b[2:4, 0:2])
-
-; CHECK-NEXT: [[TMP46:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2
-; CHECK-NEXT: [[COL_CAST165:%.*]] = bitcast double* [[TMP46]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD166:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST165]], align 8
-; CHECK-NEXT: [[COL_GEP167:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 6
-; CHECK-NEXT: [[COL_CAST168:%.*]] = bitcast double* [[COL_GEP167]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD169:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST168]], align 8
-; CHECK-NEXT: [[TMP47:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8
-; CHECK-NEXT: [[COL_CAST171:%.*]] = bitcast double* [[TMP47]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD172:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST171]], align 8
-; CHECK-NEXT: [[COL_GEP173:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12
-; CHECK-NEXT: [[COL_CAST174:%.*]] = bitcast double* [[COL_GEP173]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD175:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST174]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT178:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT178]]
-; CHECK-NEXT: [[SPLAT_SPLAT181:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP49:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD169]], <2 x double> [[SPLAT_SPLAT181]], <2 x double> [[TMP48]])
-; CHECK-NEXT: [[SPLAT_SPLAT184:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT184]]
-; CHECK-NEXT: [[SPLAT_SPLAT187:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP51:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD169]], <2 x double> [[SPLAT_SPLAT187]], <2 x double> [[TMP50]])
-
-;; + np.dot(a[2:4, 2:4], b[2:4, 2:4])
-
-; CHECK-NEXT: [[TMP52:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10
-; CHECK-NEXT: [[COL_CAST189:%.*]] = bitcast double* [[TMP52]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD190:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST189]], align 8
-; CHECK-NEXT: [[COL_GEP191:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 14
-; CHECK-NEXT: [[COL_CAST192:%.*]] = bitcast double* [[COL_GEP191]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD193:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST192]], align 8
-; CHECK-NEXT: [[TMP53:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10
-; CHECK-NEXT: [[COL_CAST195:%.*]] = bitcast double* [[TMP53]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD196:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST195]], align 8
-; CHECK-NEXT: [[COL_GEP197:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 14
-; CHECK-NEXT: [[COL_CAST198:%.*]] = bitcast double* [[COL_GEP197]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD199:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST198]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT203:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP54:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD190]], <2 x double> [[SPLAT_SPLAT203]], <2 x double> [[TMP49]])
-; CHECK-NEXT: [[SPLAT_SPLAT206:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP55:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD193]], <2 x double> [[SPLAT_SPLAT206]], <2 x double> [[TMP54]])
-; CHECK-NEXT: [[SPLAT_SPLAT210:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP56:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD190]], <2 x double> [[SPLAT_SPLAT210]], <2 x double> [[TMP51]])
-; CHECK-NEXT: [[SPLAT_SPLAT213:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP57:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD193]], <2 x double> [[SPLAT_SPLAT213]], <2 x double> [[TMP56]])
-
-;; -> c[2:4, 2:4]
-
-; CHECK-NEXT: [[TMP58:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 10
-; CHECK-NEXT: [[COL_CAST215:%.*]] = bitcast double* [[TMP58]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP55]], <2 x double>* [[COL_CAST215]], align 8
-; CHECK-NEXT: [[COL_GEP216:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 14
-; CHECK-NEXT: [[COL_CAST217:%.*]] = bitcast double* [[COL_GEP216]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP57]], <2 x double>* [[COL_CAST217]], align 8
+; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <2 x double> [[COL_LOAD12]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT18]], <2 x double> [[TMP12]])
+; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <2 x double> [[COL_LOAD15]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <2 x double> [[COL_LOAD15]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT24]], <2 x double> [[TMP14]])
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 10
+; CHECK-NEXT: [[VEC_CAST26:%.*]] = bitcast double* [[TMP16]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST26]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST29:%.*]] = bitcast double* [[TMP17]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST29]], align 8
+; CHECK-NEXT: [[VEC_GEP31:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 5
+; CHECK-NEXT: [[VEC_CAST32:%.*]] = bitcast double* [[VEC_GEP31]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD33:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST32]], align 8
+; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x double> [[COL_LOAD30]], <1 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD27]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP13]])
+; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[COL_LOAD33]], <1 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD27]], <2 x double> [[SPLAT_SPLAT41]], <2 x double> [[TMP15]])
+; CHECK-NEXT: [[VEC_CAST43:%.*]] = bitcast <10 x double>* [[C]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[VEC_CAST43]], align 8
+; CHECK-NEXT: [[VEC_GEP44:%.*]] = getelementptr <10 x double>, <10 x double>* [[C]], i64 0, i64 5
+; CHECK-NEXT: [[VEC_CAST45:%.*]] = bitcast double* [[VEC_GEP44]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[VEC_CAST45]], align 8
+
+
+; c[2:4][0:2] = a[2:4][0:2] * b[0:2][0:2] + a[2:4][2] * b[2][0:2]
+
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST47:%.*]] = bitcast double* [[TMP20]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD48:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST47]], align 8
+; CHECK-NEXT: [[VEC_GEP49:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 7
+; CHECK-NEXT: [[VEC_CAST50:%.*]] = bitcast double* [[VEC_GEP49]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST50]], align 8
+; CHECK-NEXT: [[VEC_CAST53:%.*]] = bitcast <6 x double>* [[TMP11]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD54:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST53]], align 8
+; CHECK-NEXT: [[VEC_GEP55:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST56:%.*]] = bitcast double* [[VEC_GEP55]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD57:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST56]], align 8
+; CHECK-NEXT: [[SPLAT_SPLAT60:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[COL_LOAD48]], [[SPLAT_SPLAT60]]
+; CHECK-NEXT: [[SPLAT_SPLAT63:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP22:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD51]], <2 x double> [[SPLAT_SPLAT63]], <2 x double> [[TMP21]])
+; CHECK-NEXT: [[SPLAT_SPLAT66:%.*]] = shufflevector <2 x double> [[COL_LOAD57]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD48]], [[SPLAT_SPLAT66]]
+; CHECK-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD57]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP24:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD51]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP23]])
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 12
+; CHECK-NEXT: [[VEC_CAST71:%.*]] = bitcast double* [[TMP25]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD72:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST71]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST74:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD75:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST74]], align 8
+; CHECK-NEXT: [[VEC_GEP76:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 5
+; CHECK-NEXT: [[VEC_CAST77:%.*]] = bitcast double* [[VEC_GEP76]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD78:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST77]], align 8
+; CHECK-NEXT: [[SPLAT_SPLAT82:%.*]] = shufflevector <1 x double> [[COL_LOAD75]], <1 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD72]], <2 x double> [[SPLAT_SPLAT82]], <2 x double> [[TMP22]])
+; CHECK-NEXT: [[SPLAT_SPLAT86:%.*]] = shufflevector <1 x double> [[COL_LOAD78]], <1 x double> undef, <2 x i32> zeroinitializer
[[COL_LOAD78]], <1 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD72]], <2 x double> [[SPLAT_SPLAT86]], <2 x double> [[TMP24]]) +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr <10 x double>, <10 x double>* [[C]], i64 0, i64 2 +; CHECK-NEXT: [[VEC_CAST88:%.*]] = bitcast double* [[TMP29]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[VEC_CAST88]], align 8 +; CHECK-NEXT: [[VEC_GEP89:%.*]] = getelementptr <10 x double>, <10 x double>* [[C]], i64 0, i64 7 +; CHECK-NEXT: [[VEC_CAST90:%.*]] = bitcast double* [[VEC_GEP89]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[VEC_CAST90]], align 8 + +; c[4][0:2] = a[4][0:2] * b[0:2][0:2] + a[4][0:2] * b[2][0:2] + +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 4 +; CHECK-NEXT: [[VEC_CAST92:%.*]] = bitcast double* [[TMP30]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD93:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST92]], align 8 +; CHECK-NEXT: [[VEC_GEP94:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 9 +; CHECK-NEXT: [[VEC_CAST95:%.*]] = bitcast double* [[VEC_GEP94]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST95]], align 8 +; CHECK-NEXT: [[VEC_CAST98:%.*]] = bitcast <6 x double>* [[TMP11]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST98]], align 8 +; CHECK-NEXT: [[VEC_GEP100:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 3 +; CHECK-NEXT: [[VEC_CAST101:%.*]] = bitcast double* [[VEC_GEP100]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST101]], align 8 +; CHECK-NEXT: [[SPLAT_SPLATINSERT104:%.*]] = shufflevector <2 x double> [[COL_LOAD99]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x double> [[COL_LOAD93]], [[SPLAT_SPLATINSERT104]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT107:%.*]] = shufflevector <2 x double> [[COL_LOAD99]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD96]], <1 x double> [[SPLAT_SPLATINSERT107]], <1 x double> [[TMP31]]) +; CHECK-NEXT: [[SPLAT_SPLATINSERT110:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = fmul <1 x double> [[COL_LOAD93]], [[SPLAT_SPLATINSERT110]] +; CHECK-NEXT: [[SPLAT_SPLATINSERT113:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD96]], <1 x double> [[SPLAT_SPLATINSERT113]], <1 x double> [[TMP33]]) +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr <15 x double>, <15 x double>* [[TMP5]], i64 0, i64 14 +; CHECK-NEXT: [[VEC_CAST116:%.*]] = bitcast double* [[TMP35]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD117:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST116]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 2 +; CHECK-NEXT: [[VEC_CAST119:%.*]] = bitcast double* [[TMP36]] to <1 x double>* +; CHECK-NEXT: [[COL_LOAD120:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST119]], align 8 +; CHECK-NEXT: [[VEC_GEP121:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 5 +; CHECK-NEXT: [[VEC_CAST122:%.*]] = bitcast double* [[VEC_GEP121]] to <1 x 
double>* +; CHECK-NEXT: [[COL_LOAD123:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST122]], align 8 +; CHECK-NEXT: [[TMP37:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD117]], <1 x double> [[COL_LOAD120]], <1 x double> [[TMP32]]) +; CHECK-NEXT: [[TMP38:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD117]], <1 x double> [[COL_LOAD123]], <1 x double> [[TMP34]]) +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr <10 x double>, <10 x double>* [[C]], i64 0, i64 4 +; CHECK-NEXT: [[VEC_CAST133:%.*]] = bitcast double* [[TMP39]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP37]], <1 x double>* [[VEC_CAST133]], align 8 +; CHECK-NEXT: [[VEC_GEP134:%.*]] = getelementptr <10 x double>, <10 x double>* [[C]], i64 0, i64 9 +; CHECK-NEXT: [[VEC_CAST135:%.*]] = bitcast double* [[VEC_GEP134]] to <1 x double>* +; CHECK-NEXT: store <1 x double> [[TMP38]], <1 x double>* [[VEC_CAST135]], align 8 ; CHECK-NEXT: ret void ; entry: - %a = load <16 x double>, <16 x double>* %A, align 16 - %b = load <16 x double>, <16 x double>* %B, align 16 + %a = load <15 x double>, <15 x double>* %A, align 16 + %b = load <6 x double>, <6 x double>* %B, align 16 - %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4) + %c = call <10 x double> @llvm.matrix.multiply(<15 x double> %a, <6 x double> %b, i32 5, i32 3, i32 2) - store <16 x double> %c, <16 x double>* %C, align 16 + store <10 x double> %c, <10 x double>* %C, align 16 ret void } -declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32) +declare <10 x double> @llvm.matrix.multiply(<15 x double>, <6 x double>, i32, i32, i32)
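
For reference (not part of the patch): a minimal NumPy sketch of the tile decomposition the updated CHECK lines above verify for the non-multiple-of-TileSize case (5x3 * 3x2 with 2x2 tiles, tile extents clamped at the edges, just as the `c[...] = a[...] * b[...] + ...` comments describe). It ignores the column-major layout of the embedded IR, and `tiled_matmul`/`TILE` are illustrative names, not identifiers from the pass.

# tiled_matmul.py -- illustrative only; assumes TileSize == 2.
import numpy as np

TILE = 2

def tiled_matmul(a, b):
    r, m = a.shape                       # 5x3 in this test
    m2, c = b.shape                      # 3x2 in this test
    assert m == m2
    res = np.zeros((r, c))
    for j in range(0, c, TILE):          # result columns, outer loop
        for i in range(0, r, TILE):      # result rows, inner loop
            tr = min(TILE, r - i)        # clamp tile height at the edge
            tc = min(TILE, c - j)        # clamp tile width at the edge
            acc = np.zeros((tr, tc))     # accumulator for this tile
            for k in range(0, m, TILE):  # reduction dimension
                tm = min(TILE, m - k)
                acc += a[i:i+tr, k:k+tm] @ b[k:k+tm, j:j+tc]
            res[i:i+tr, j:j+tc] = acc    # one store per finished tile
    return res

a = np.arange(15.0).reshape(5, 3)
b = np.arange(6.0).reshape(3, 2)
assert np.allclose(tiled_matmul(a, b), a @ b)

The loop order (columns outer, rows inner, reduction innermost) and the edge clamping mirror the tile blocks checked above: two full 2x2 row tiles followed by a 1-row remainder tile for c[4][0:2].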