diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -719,9 +719,9 @@
       if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
         Changed |= VisitBinaryOperator(BinOp);
       if (match(Inst, m_Load(m_Value(Op1))))
-        Changed |= VisitLoad(Inst, Op1, Builder);
+        Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
-        Changed |= VisitStore(Inst, Op1, Op2, Builder);
+        Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
     }
 
     RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
@@ -733,16 +733,18 @@
     return Changed;
   }
 
-  LoadInst *createVectorLoad(Value *ColumnPtr, Type *EltType,
+  LoadInst *createVectorLoad(Value *ColumnPtr, Type *EltType, bool IsVolatile,
                              IRBuilder<> &Builder) {
-    return Builder.CreateAlignedLoad(
-        ColumnPtr, Align(DL.getABITypeAlignment(EltType)), "col.load");
+    return Builder.CreateAlignedLoad(ColumnPtr,
+                                     Align(DL.getABITypeAlignment(EltType)),
+                                     IsVolatile, "col.load");
   }
 
   StoreInst *createVectorStore(Value *ColumnValue, Value *ColumnPtr,
-                               Type *EltType, IRBuilder<> &Builder) {
+                               Type *EltType, bool IsVolatile,
+                               IRBuilder<> &Builder) {
     return Builder.CreateAlignedStore(ColumnValue, ColumnPtr,
-                                      DL.getABITypeAlign(EltType));
+                                      DL.getABITypeAlign(EltType), IsVolatile);
   }
 
   /// Turns \p BasePtr into an elementwise pointer to \p EltType.
@@ -778,8 +780,8 @@
 
   /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
   /// vectors.
-  MatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, ShapeInfo Shape,
-                      IRBuilder<> &Builder) {
+  MatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, bool IsVolatile,
+                      ShapeInfo Shape, IRBuilder<> &Builder) {
     auto VType = cast<VectorType>(Ty);
     Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
     MatrixTy Result;
@@ -787,7 +789,8 @@
       Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
                                      Shape.getStride(),
                                      VType->getElementType(), Builder);
-      Value *Vector = createVectorLoad(GEP, VType->getElementType(), Builder);
+      Value *Vector =
+          createVectorLoad(GEP, VType->getElementType(), IsVolatile, Builder);
       Result.addVector(Vector);
     }
     return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
@@ -796,8 +799,8 @@
 
   /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix,
   /// starting at \p MatrixPtr[I][J].
-  MatrixTy loadMatrix(Value *MatrixPtr, ShapeInfo MatrixShape, Value *I,
-                      Value *J, ShapeInfo ResultShape, Type *EltTy,
+  MatrixTy loadMatrix(Value *MatrixPtr, bool IsVolatile, ShapeInfo MatrixShape,
+                      Value *I, Value *J, ShapeInfo ResultShape, Type *EltTy,
                       IRBuilder<> &Builder) {
 
     Value *Offset = Builder.CreateAdd(
@@ -814,17 +817,18 @@
         Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
 
     return loadMatrix(TileTy, TilePtr,
-                      Builder.getInt64(MatrixShape.getStride()), ResultShape,
-                      Builder);
+                      Builder.getInt64(MatrixShape.getStride()), IsVolatile,
+                      ResultShape, Builder);
   }
 
   /// Lower a load instruction with shape information.
-  void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride,
+  void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, bool IsVolatile,
                  ShapeInfo Shape) {
     IRBuilder<> Builder(Inst);
-    finalizeLowering(Inst,
-                     loadMatrix(Inst->getType(), Ptr, Stride, Shape, Builder),
-                     Builder);
+    finalizeLowering(
+        Inst,
+        loadMatrix(Inst->getType(), Ptr, Stride, IsVolatile, Shape, Builder),
+        Builder);
   }
 
   /// Lowers llvm.matrix.column.major.load.
@@ -836,12 +840,13 @@
     Value *Ptr = Inst->getArgOperand(0);
     Value *Stride = Inst->getArgOperand(1);
     LowerLoad(Inst, Ptr, Stride,
+              cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
              {Inst->getArgOperand(3), Inst->getArgOperand(4)});
   }
 
   /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
   /// MatrixPtr[I][J].
-  void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
+  void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr, bool IsVolatile,
                    ShapeInfo MatrixShape, Value *I, Value *J, Type *EltTy,
                    IRBuilder<> &Builder) {
     Value *Offset = Builder.CreateAdd(
@@ -858,20 +863,21 @@
         Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
 
     storeMatrix(TileTy, StoreVal, TilePtr,
-                Builder.getInt64(MatrixShape.getStride()), Builder);
+                Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
   }
 
   /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
   /// vectors.
   MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr, Value *Stride,
-                       IRBuilder<> &Builder) {
+                       bool IsVolatile, IRBuilder<> &Builder) {
    auto VType = cast<VectorType>(Ty);
    Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
    for (auto Vec : enumerate(StoreVal.vectors())) {
      Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
                                     Stride, StoreVal.getStride(),
                                     VType->getElementType(), Builder);
-      createVectorStore(Vec.value(), GEP, VType->getElementType(), Builder);
+      createVectorStore(Vec.value(), GEP, VType->getElementType(), IsVolatile,
+                        Builder);
    }
    return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
                                   StoreVal.getNumVectors());
@@ -879,12 +885,13 @@
 
   /// Lower a store instruction with shape information.
   void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride,
-                  ShapeInfo Shape) {
+                  bool IsVolatile, ShapeInfo Shape) {
     IRBuilder<> Builder(Inst);
     auto StoreVal = getMatrix(Matrix, Shape, Builder);
-    finalizeLowering(
-        Inst, storeMatrix(Matrix->getType(), StoreVal, Ptr, Stride, Builder),
-        Builder);
+    finalizeLowering(Inst,
+                     storeMatrix(Matrix->getType(), StoreVal, Ptr, Stride,
+                                 IsVolatile, Builder),
+                     Builder);
   }
 
   /// Lowers llvm.matrix.column.major.store.
@@ -897,6 +904,7 @@
     Value *Ptr = Inst->getArgOperand(1);
     Value *Stride = Inst->getArgOperand(2);
     LowerStore(Inst, Matrix, Ptr, Stride,
+               cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
               {Inst->getArgOperand(4), Inst->getArgOperand(5)});
   }
 
@@ -1208,16 +1216,16 @@
       for (unsigned K = 0; K < M; K += TileSize) {
         const unsigned TileM = std::min(M - K, unsigned(TileSize));
-        MatrixTy A =
-            loadMatrix(APtr, LShape, Builder.getInt64(I), Builder.getInt64(K),
-                       {TileR, TileM}, EltType, Builder);
-        MatrixTy B =
-            loadMatrix(BPtr, RShape, Builder.getInt64(K), Builder.getInt64(J),
-                       {TileM, TileC}, EltType, Builder);
+        MatrixTy A = loadMatrix(APtr, LoadOp0->isVolatile(), LShape,
+                                Builder.getInt64(I), Builder.getInt64(K),
+                                {TileR, TileM}, EltType, Builder);
+        MatrixTy B = loadMatrix(BPtr, LoadOp1->isVolatile(), RShape,
+                                Builder.getInt64(K), Builder.getInt64(J),
+                                {TileM, TileC}, EltType, Builder);
         emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
       }
 
-      storeMatrix(Res, CPtr, {R, M}, Builder.getInt64(I), Builder.getInt64(J),
-                  EltType, Builder);
+      storeMatrix(Res, CPtr, Store->isVolatile(), {R, M}, Builder.getInt64(I),
+                  Builder.getInt64(J), EltType, Builder);
     }
 
     // Mark eliminated instructions as fused and remove them.
@@ -1325,23 +1333,24 @@
   }
 
   /// Lower load instructions, if shape information is available.
-  bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) {
+  bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
     auto I = ShapeMap.find(Inst);
     if (I == ShapeMap.end())
       return false;
 
-    LowerLoad(Inst, Ptr, Builder.getInt64(I->second.getStride()), I->second);
+    LowerLoad(Inst, Ptr, Builder.getInt64(I->second.getStride()),
+              Inst->isVolatile(), I->second);
     return true;
   }
 
-  bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr,
+  bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
                   IRBuilder<> &Builder) {
     auto I = ShapeMap.find(StoredVal);
     if (I == ShapeMap.end())
       return false;
 
     LowerStore(Inst, StoredVal, Ptr, Builder.getInt64(I->second.getStride()),
-               I->second);
+               Inst->isVolatile(), I->second);
     return true;
   }
 
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/load-align-volatile.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/load-align-volatile.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/load-align-volatile.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/load-align-volatile.ll
@@ -8,15 +8,15 @@
 ; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START]]
 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
-; CHECK-NEXT: load <3 x double>, <3 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: load volatile <3 x double>, <3 x double>* [[VEC_CAST]], align 8
 ; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
 ; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START1]]
 ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <3 x double>*
-; CHECK-NEXT: load <3 x double>, <3 x double>* [[VEC_CAST3]], align 8
+; CHECK-NEXT: load volatile <3 x double>, <3 x double>* [[VEC_CAST3]], align 8
 ; CHECK-NEXT: [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]]
 ; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START5]]
 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <3 x double>*
-; CHECK-NEXT: load <3 x double>, <3 x double>* [[VEC_CAST7]], align 8
+; CHECK-NEXT: load volatile <3 x double>, <3 x double>* [[VEC_CAST7]], align 8
 ; CHECK-NOT: = load
 ;
 entry:
@@ -30,10 +30,10 @@
 ; CHECK-LABEL: @load_volatile_multiply(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double>* [[IN:%.*]] to double*
 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST]], align 8
 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP1]], i64 2
 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
 ; CHECK-NOT: = load
 ;
   %in.m = load volatile <4 x double>, <4 x double>* %in, align 8
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
@@ -14,29 +14,29 @@
 ; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP1]] to <4 x double>*
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x double>* [[COL_CAST]] to double*
 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST]], align 8
 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP2]], i64 2
 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[B:%.*]] to double*
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, double* [[TMP3]], i64 0
 ; CHECK-NEXT: [[COL_CAST3:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double>* [[COL_CAST3]] to double*
 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
 ; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP5]], i64 2
 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
 ; CHECK: [[TMP18:%.*]] = bitcast <4 x double>* [[C:%.*]] to double*
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 0
 ; CHECK-NEXT: [[COL_CAST18:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
 ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x double>* [[COL_CAST18]] to double*
 ; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[TMP20]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> {{.*}}, <2 x double>* [[VEC_CAST19]], align 8
+; CHECK-NEXT: store volatile <2 x double> {{.*}}, <2 x double>* [[VEC_CAST19]], align 8
 ; CHECK-NEXT: [[VEC_GEP20:%.*]] = getelementptr double, double* [[TMP20]], i64 2
 ; CHECK-NEXT: [[VEC_CAST21:%.*]] = bitcast double* [[VEC_GEP20]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> {{.*}}, <2 x double>* [[VEC_CAST21]], align 8
+; CHECK-NEXT: store volatile <2 x double> {{.*}}, <2 x double>* [[VEC_CAST21]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -59,10 +59,10 @@
 ; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP1]] to <4 x double>*
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x double>* [[COL_CAST]] to double*
 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST]], align 8
 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP2]], i64 2
 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[B:%.*]] to double*
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, double* [[TMP3]], i64 0
 ; CHECK-NEXT: [[COL_CAST3:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
@@ -112,10 +112,10 @@
 ; CHECK-NEXT: [[COL_CAST3:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double>* [[COL_CAST3]] to double*
 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
 ; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP5]], i64 2
 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
-; CHECK-NEXT: load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
+; CHECK-NEXT: load volatile <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
 ; CHECK: [[TMP18:%.*]] = bitcast <4 x double>* [[C:%.*]] to double*
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 0
@@ -166,10 +166,10 @@
 ; CHECK-NEXT: [[COL_CAST18:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
 ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x double>* [[COL_CAST18]] to double*
 ; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[TMP20]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> {{.*}}, <2 x double>* [[VEC_CAST19]], align 8
+; CHECK-NEXT: store volatile <2 x double> {{.*}}, <2 x double>* [[VEC_CAST19]], align 8
 ; CHECK-NEXT: [[VEC_GEP20:%.*]] = getelementptr double, double* [[TMP20]], i64 2
 ; CHECK-NEXT: [[VEC_CAST21:%.*]] = bitcast double* [[VEC_GEP20]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> {{.*}}, <2 x double>* [[VEC_CAST21]], align 8
+; CHECK-NEXT: store volatile <2 x double> {{.*}}, <2 x double>* [[VEC_CAST21]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/store-align-volatile.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/store-align-volatile.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/store-align-volatile.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/store-align-volatile.ll
@@ -6,10 +6,10 @@
 ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x i32> [[IN:%.*]], <6 x i32> undef, <3 x i32>
 ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[IN]], <6 x i32> undef, <3 x i32>
 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[OUT:%.*]] to <3 x i32>*
-; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4
+; CHECK-NEXT: store volatile <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4
 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i64 5
 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>*
-; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST2]], align 4
+; CHECK-NEXT: store volatile <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST2]], align 4
 ; CHECK-NEXT: ret void
 ;
   call void @llvm.matrix.column.major.store(<6 x i32> %in, i32* %out, i64 5, i1 true, i64 3, i64 2)
@@ -23,10 +23,10 @@
 ; CHECK-LABEL: @multiply_store_volatile(
 ; CHECK: [[TMP29:%.*]] = bitcast <4 x i32>* %out to i32*
 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[TMP29]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> {{.*}}, <2 x i32>* [[VEC_CAST]], align 4
+; CHECK-NEXT: store volatile <2 x i32> {{.*}}, <2 x i32>* [[VEC_CAST]], align 4
 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP29]], i64 2
 ; CHECK-NEXT: [[VEC_CAST25:%.*]] = bitcast i32* [[VEC_GEP]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> {{.*}}, <2 x i32>* [[VEC_CAST25]], align 4
+; CHECK-NEXT: store volatile <2 x i32> {{.*}}, <2 x i32>* [[VEC_CAST25]], align 4
 ; CHECK-NEXT: ret void
 ;
   %res = call <4 x i32> @llvm.matrix.multiply(<4 x i32> %in, <4 x i32> %in, i32 2, i32 2, i32 2)
@@ -43,11 +43,11 @@
 ; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 [[VEC_START]]
 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>*
-; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4
+; CHECK-NEXT: store volatile <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4
 ; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE]]
 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, i32* [[OUT]], i64 [[VEC_START2]]
 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast i32* [[VEC_GEP3]] to <3 x i32>*
-; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST4]], align 4
+; CHECK-NEXT: store volatile <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST4]], align 4
 ; CHECK-NEXT: ret void
 ;
   call void @llvm.matrix.column.major.store(<6 x i32> %in, i32* align 32 %out, i64 %stride, i1 true, i64 3, i64 2)