diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -35,6 +35,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
@@ -1939,6 +1940,10 @@
     return EmitLoadOfGlobalRegLValue(LV);

   if (LV.isMatrixElt()) {
+    llvm::Value *Idx = LV.getMatrixIdx();
+    auto MatTy = LV.getType()->getAs<ConstantMatrixType>();
+    llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+    MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
     llvm::LoadInst *Load =
         Builder.CreateLoad(LV.getMatrixAddress(), LV.isVolatileQualified());
     return RValue::get(
@@ -2080,9 +2085,13 @@
     return EmitStoreThroughGlobalRegLValue(Src, Dst);

   if (Dst.isMatrixElt()) {
-    llvm::Value *Vec = Builder.CreateLoad(Dst.getMatrixAddress());
-    Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(),
-                                      Dst.getMatrixIdx(), "matins");
+    llvm::Value *Idx = Dst.getMatrixIdx();
+    auto MatTy = Dst.getType()->getAs<ConstantMatrixType>();
+    llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+    MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
+    llvm::Instruction *Load = Builder.CreateLoad(Dst.getMatrixAddress());
+    llvm::Value *Vec =
+        Builder.CreateInsertElement(Load, Src.getScalarVal(), Idx, "matins");
     Builder.CreateStore(Vec, Dst.getMatrixAddress(),
                         Dst.isVolatileQualified());
     return;
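For context, the LValue load/store paths above now emit an `llvm.assume` that the flattened index is in bounds before touching the vector. A minimal source-level trigger, mirroring the 2x3 float case checked in the tests below (the type alias and function name are illustrative, not taken from the test file):

```cpp
// Hypothetical example; compile as C++ with: clang -fenable-matrix -emit-llvm -S
typedef float fx2x3_t __attribute__((matrix_type(2, 3)));

void insert(fx2x3_t &mat, float e, unsigned j, unsigned k) {
  // Lowered as: idx = k * 2 + j, then llvm.assume(idx < 6),
  // then the load/insertelement/store of the <6 x float> value.
  mat[j][k] = e;
}
```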
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1754,13 +1754,17 @@
   // integer value.
   Value *RowIdx = Visit(E->getRowIdx());
   Value *ColumnIdx = Visit(E->getColumnIdx());
+
+  auto *MatrixTy = E->getBase()->getType()->castAs<ConstantMatrixType>();
+  unsigned NumRows = MatrixTy->getNumRows();
+  llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+  Value *Idx = MB.CreateIndex(RowIdx, ColumnIdx, NumRows);
+  MB.CreateIndexAssumption(Idx, MatrixTy->getNumElementsFlattened());
+
   Value *Matrix = Visit(E->getBase());

   // TODO: Should we emit bounds checks with SanitizerKind::ArrayBounds?
-  llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
-  return MB.CreateExtractElement(
-      Matrix, RowIdx, ColumnIdx,
-      E->getBase()->getType()->castAs<ConstantMatrixType>()->getNumRows());
+  return Builder.CreateExtractElement(Matrix, Idx, "matrixext");
 }

 static int getMaskElt(llvm::ShuffleVectorInst *SVI, unsigned Idx,
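The rvalue extract path is refactored so the flattened index (and the assumption) is computed before visiting the base, leaving a plain `extractelement`. A sketch of a source-level trigger for the 27-element case in the tests (names illustrative; the actual test indexes with the same variable twice):

```cpp
typedef int ix9x3_t __attribute__((matrix_type(9, 3)));

int extract(ix9x3_t mat, unsigned long i, unsigned long j) {
  // Lowered as: idx = j * 9 + i, llvm.assume(idx < 27),
  // then load <27 x i32> and extractelement at idx.
  return mat[i][j];
}
```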
diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c
--- a/clang/test/CodeGen/matrix-type-operators.c
+++ b/clang/test/CodeGen/matrix-type-operators.c
@@ -874,6 +874,8 @@
  // CHECK-NEXT: [[K_EXT:%.*]] = zext i32 [[K]] to i64
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K_EXT]], 2
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
  // CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -890,6 +892,8 @@
  // CHECK-NEXT: [[K:%.*]] = load i64, i64* %k.addr, align 8
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K]], 2
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
  // CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -907,6 +911,8 @@
  // CHECK-NEXT: [[I2_ADD:%.*]] = add nsw i32 4, [[I2]]
  // CHECK-NEXT: [[ADD_EXT:%.*]] = sext i32 [[I2_ADD]] to i64
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 18, [[ADD_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <27 x i32> [[MAT]], i32 [[I1]], i64 [[IDX2]]
  // CHECK-NEXT: store <27 x i32> [[MATINS]], <27 x i32>* [[MAT_ADDR]], align 4
@@ -980,9 +986,11 @@
  // CHECK-LABEL: @extract_int(
  // CHECK: [[J1:%.*]] = load i64, i64* %j.addr, align 8
  // CHECK-NEXT: [[J2:%.*]] = load i64, i64* %j.addr, align 8
- // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J2]], 9
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J1]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+ // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
  // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <27 x i32> [[MAT]], i64 [[IDX2]]
  // CHECK-NEXT: ret i32 [[MATEXT]]
@@ -995,13 +1003,15 @@
  // CHECK-LABEL: @test_extract_matrix_pointer1(
  // CHECK: [[J:%.*]] = load i32, i32* %j.addr, align 4
  // CHECK-NEXT: [[J_EXT:%.*]] = zext i32 [[J]] to i64
+ // CHECK-NEXT: [[IDX:%.*]] = add i64 3, [[J_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 6
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[PTR:%.*]] = load [6 x double]**, [6 x double]*** %ptr.addr, align 8
  // CHECK-NEXT: [[PTR_IDX:%.*]] = getelementptr inbounds [6 x double]*, [6 x double]** [[PTR]], i64 1
  // CHECK-NEXT: [[PTR2:%.*]] = load [6 x double]*, [6 x double]** [[PTR_IDX]], align 8
  // CHECK-NEXT: [[PTR2_IDX:%.*]] = getelementptr inbounds [6 x double], [6 x double]* [[PTR2]], i64 2
  // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [6 x double]* [[PTR2_IDX]] to <6 x double>*
  // CHECK-NEXT: [[MAT:%.*]] = load <6 x double>, <6 x double>* [[MAT_ADDR]], align 8
- // CHECK-NEXT: [[IDX:%.*]] = add i64 3, [[J_EXT]]
  // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 [[IDX]]
  // CHECK-NEXT: ret double [[MATEXT]]
@@ -1027,13 +1037,17 @@
  // CHECK-LABEL: @insert_extract(
  // CHECK: [[K:%.*]] = load i16, i16* %k.addr, align 2
  // CHECK-NEXT: [[K_EXT:%.*]] = sext i16 [[K]] to i64
- // CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K_EXT]], 3
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], 0
- // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 9
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+ // CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
+ // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX2]]
  // CHECK-NEXT: [[J:%.*]] = load i64, i64* %j.addr, align 8
  // CHECK-NEXT: [[IDX3:%.*]] = mul i64 [[J]], 3
  // CHECK-NEXT: [[IDX4:%.*]] = add i64 [[IDX3]], 2
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX4]], 9
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT2:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], i64 [[IDX4]]
  // CHECK-NEXT: store <9 x float> [[MATINS]], <9 x float>* [[MAT_ADDR]], align 4
@@ -1068,9 +1082,13 @@
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
  // CHECK-NEXT: [[MAT_PTR:%.*]] = bitcast [6 x float]* %mat to <6 x float>*
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
  // CHECK-NEXT: [[EXT:%.*]] = extractelement <6 x float> [[MAT]], i64 [[IDX2]]
  // CHECK-NEXT: [[SUM:%.*]] = fadd float [[EXT]], {{.*}}
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT2:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
  // CHECK-NEXT: [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], i64 [[IDX2]]
  // CHECK-NEXT: store <6 x float> [[INS]], <6 x float>* [[MAT_PTR]], align 4
@@ -1085,23 +1103,29 @@
  // CHECK-NEXT: [[I1_EXT:%.*]] = sext i32 [[I1]] to i64
  // CHECK-NEXT: [[J1:%.*]] = load i32, i32* %j.addr, align 4
  // CHECK-NEXT: [[J1_EXT:%.*]] = sext i32 [[J1]] to i64
- // CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
  // CHECK-NEXT: [[IDX1_1:%.*]] = mul i64 [[J1_EXT]], 9
  // CHECK-NEXT: [[IDX1_2:%.*]] = add i64 [[IDX1_1]], [[I1_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX1_2]], 27
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+ // CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
  // CHECK-NEXT: [[MI1:%.*]] = extractelement <27 x i32> [[A]], i64 [[IDX1_2]]
  // CHECK-NEXT: [[MI1_EXT:%.*]] = sext i32 [[MI1]] to i64
  // CHECK-NEXT: [[J2:%.*]] = load i32, i32* %j.addr, align 4
  // CHECK-NEXT: [[J2_EXT:%.*]] = sext i32 [[J2]] to i64
  // CHECK-NEXT: [[I2:%.*]] = load i32, i32* %i.addr, align 4
  // CHECK-NEXT: [[I2_EXT:%.*]] = sext i32 [[I2]] to i64
- // CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
  // CHECK-NEXT: [[IDX2_1:%.*]] = mul i64 [[I2_EXT]], 9
  // CHECK-NEXT: [[IDX2_2:%.*]] = add i64 [[IDX2_1]], [[J2_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2_2]], 27
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+ // CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
  // CHECK-NEXT: [[MI2:%.*]] = extractelement <27 x i32> [[A2]], i64 [[IDX2_2]]
  // CHECK-NEXT: [[MI3:%.*]] = add nsw i32 [[MI2]], 2
  // CHECK-NEXT: [[MI3_EXT:%.*]] = sext i32 [[MI3]] to i64
  // CHECK-NEXT: [[IDX3_1:%.*]] = mul i64 [[MI3_EXT]], 5
  // CHECK-NEXT: [[IDX3_2:%.*]] = add i64 [[IDX3_1]], [[MI1_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX3_2]], 25
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[B:%.*]] = load <25 x double>, <25 x double>* [[B_PTR:%.*]], align 8
  // CHECK-NEXT: [[INS:%.*]] = insertelement <25 x double> [[B]], double 1.500000e+00, i64 [[IDX3_2]]
  // CHECK-NEXT: store <25 x double> [[INS]], <25 x double>* [[B_PTR]], align 8
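The @@ -1068 hunk above covers compound assignment, which lowers to an extract followed by an insert and therefore now carries the assumption twice, once per access. A sketch of the source shape (illustrative names):

```cpp
typedef float fx2x3_t __attribute__((matrix_type(2, 3)));

void fadd(fx2x3_t &mat, unsigned i, unsigned j, float x) {
  // idx = j * 2 + i; llvm.assume(idx < 6) is emitted before the
  // extractelement and again before the insertelement of the sum.
  mat[i][j] += x;
}
```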
diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp
--- a/clang/test/CodeGenCXX/matrix-type-operators.cpp
+++ b/clang/test/CodeGenCXX/matrix-type-operators.cpp
@@ -219,6 +219,8 @@
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
  // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [4 x i32]* {{.*}} to <4 x i32>*
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 4
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <4 x i32>, <4 x i32>* [[MAT_ADDR]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], i64 [[IDX2]]
  // CHECK-NEXT: store <4 x i32> [[MATINS]], <4 x i32>* [[MAT_ADDR]], align 4
@@ -243,6 +245,8 @@
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 3
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
  // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [24 x float]* {{.*}} to <24 x float>*
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 24
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <24 x float>, <24 x float>* [[MAT_ADDR]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <24 x float> [[MAT]], float [[E]], i64 [[IDX2]]
  // CHECK-NEXT: store <24 x float> [[MATINS]], <24 x float>* [[MAT_ADDR]], align 4
@@ -315,11 +319,13 @@
  // CHECK-NEXT: [[J:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} %j)
  // CHECK-NEXT: [[J_SUB:%.*]] = sub i32 [[J]], 1
  // CHECK-NEXT: [[J_SUB_EXT:%.*]] = zext i32 [[J_SUB]] to i64
+ // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
+ // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT_ADDR:%.*]] = load [16 x double]*, [16 x double]** %m.addr, align 8
  // CHECK-NEXT: [[MAT_ADDR2:%.*]] = bitcast [16 x double]* [[MAT_ADDR]] to <16 x double>*
  // CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* [[MAT_ADDR2]], align 8
- // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
- // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
  // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
  // CHECK-NEXT: ret double [[MATEXT]]
  return m[i + 1][j - 1];
@@ -358,6 +364,8 @@
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
  // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [16 x float]* %result to <16 x float>*
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <16 x float>, <16 x float>* [[MAT_ADDR]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, i64 [[IDX2]]
  // CHECK-NEXT: store <16 x float> [[MATINS]], <16 x float>* [[MAT_ADDR]], align 4
@@ -386,6 +394,8 @@
  // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 5
  // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
  // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [25 x i32]* %result to <25 x i32>*
+ // CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 25
+ // CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
  // CHECK-NEXT: [[MAT:%.*]] = load <25 x i32>, <25 x i32>* [[MAT_ADDR]], align 4
  // CHECK-NEXT: [[MATINS:%.*]] = insertelement <25 x i32> [[MAT]], i32 1, i64 [[IDX2]]
  // CHECK-NEXT: store <25 x i32> [[MATINS]], <25 x i32>* [[MAT_ADDR]], align 4
diff --git a/clang/test/CodeGenObjC/matrix-type-operators.m b/clang/test/CodeGenObjC/matrix-type-operators.m
--- a/clang/test/CodeGenObjC/matrix-type-operators.m
+++ b/clang/test/CodeGenObjC/matrix-type-operators.m
@@ -22,9 +22,11 @@
 // CHECK-NEXT: [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8*
 // CHECK-NEXT: [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]])
 // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64
-// CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8
 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4
 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]]
+// CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+// CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+// CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8
 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
 // CHECK-NEXT: ret double [[MATEXT]]
 //
@@ -49,12 +51,14 @@
 // CHECK-NEXT: [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8*
 // CHECK-NEXT: [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]])
 // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64
+// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4
+// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]]
+// CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+// CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
 // CHECK-NEXT: [[M:%.*]] = load %1*, %1** %m.addr, align 8
 // CHECK-NEXT: [[SEL3:%.*]] = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7
 // CHECK-NEXT: [[M_PTR:%.*]] = bitcast %1* [[M]] to i8*
 // CHECK-NEXT: [[MAT:%.*]] = call <16 x double> bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to <16 x double> (i8*, i8*)*)(i8* [[M_PTR]], i8* [[SEL3]])
-// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4
-// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]]
 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
 // CHECK-NEXT: ret double [[MATEXT]]
 //
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -231,9 +231,25 @@
             : (IsUnsigned ? B.CreateUDiv(LHS, RHS) : B.CreateSDiv(LHS, RHS));
   }

-  /// Extracts the element at (\p RowIdx, \p ColumnIdx) from \p Matrix.
-  Value *CreateExtractElement(Value *Matrix, Value *RowIdx, Value *ColumnIdx,
-                              unsigned NumRows, Twine const &Name = "") {
+  /// Create an assumption that \p Idx is less than \p NumElements.
+  void CreateIndexAssumption(Value *Idx, unsigned NumElements,
+                             Twine const &Name = "") {
+
+    Value *NumElts =
+        B.getIntN(Idx->getType()->getScalarSizeInBits(), NumElements);
+
+    auto *Cmp = B.CreateICmpULT(Idx, NumElts);
+    if (!isa<Constant>(Cmp)) {
+      Function *TheFn =
+          Intrinsic::getDeclaration(getModule(), Intrinsic::assume);
+      B.CreateCall(TheFn->getFunctionType(), TheFn, {Cmp}, Name);
+    }
+  }
+
+  /// Compute the index to access the element at (\p RowIdx, \p ColumnIdx) from
+  /// a matrix with \p NumRows embedded in a vector.
+  Value *CreateIndex(Value *RowIdx, Value *ColumnIdx, unsigned NumRows,
+                     Twine const &Name = "") {
     unsigned MaxWidth = std::max(RowIdx->getType()->getScalarSizeInBits(),
                                  ColumnIdx->getType()->getScalarSizeInBits());
@@ -241,9 +257,7 @@
     RowIdx = B.CreateZExt(RowIdx, IntTy);
     ColumnIdx = B.CreateZExt(ColumnIdx, IntTy);
     Value *NumRowsV = B.getIntN(MaxWidth, NumRows);
-    return B.CreateExtractElement(
-        Matrix, B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx),
-        "matext");
+    return B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx);
   }
 };
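Taken together, the two new helpers replace the old fused CreateExtractElement. A sketch of how a client is expected to use them, assuming the templated MatrixBuilder in this tree (later LLVM drops the template parameter; builder setup is elided and names are illustrative):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MatrixBuilder.h"

// Emit the flattened index for element (Row, Col) of a 3x4 matrix stored
// in a <12 x T> vector, plus the in-bounds assumption.
llvm::Value *emitCheckedIndex(llvm::IRBuilder<> &Builder, llvm::Value *Row,
                              llvm::Value *Col) {
  llvm::MatrixBuilder<llvm::IRBuilder<>> MB(Builder);
  llvm::Value *Idx = MB.CreateIndex(Row, Col, /*NumRows=*/3);
  // No assume is emitted when the comparison folds to a constant.
  MB.CreateIndexAssumption(Idx, /*NumElements=*/12);
  return Idx; // feed into IRBuilder::CreateExtractElement/CreateInsertElement
}
```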
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -807,6 +807,11 @@
   // Delete small array after loop unroll.
   FPM.addPass(SROA());

+  // The matrix extension can introduce large vector operations early, which can
+  // benefit from running vector-combine early on.
+  if (EnableMatrix)
+    FPM.addPass(VectorCombinePass());
+
   // Eliminate redundancies.
   FPM.addPass(MergedLoadStoreMotionPass());
   if (RunNewGVN)
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -427,6 +427,11 @@
   MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
   MPM.add(createReassociatePass());       // Reassociate expressions

+  // The matrix extension can introduce large vector operations early, which can
+  // benefit from running vector-combine early on.
+  if (EnableMatrix)
+    MPM.add(createVectorCombinePass());
+
   // Begin the loop pass pipeline.
   if (EnableSimpleLoopUnswitch) {
     // The simple loop unswitch pass relies on separate cleanup passes. Schedule
diff --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
--- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
+++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
@@ -109,6 +109,7 @@
 ; LEGACY-NEXT: Dominator Tree Construction
 ; LEGACY-NEXT: Basic Alias Analysis (stateless AA impl)
 ; LEGACY-NEXT: Function Alias Analysis Results
+; LEGACY-NEXT: Optimize scalar/vector ops
 ; LEGACY-NEXT: Memory SSA
 ; LEGACY-NEXT: Natural Loop Information
 ; LEGACY-NEXT: Canonicalize natural loops
@@ -424,6 +425,7 @@
 ; NEWPM-NEXT: Running pass: LoopSimplifyPass on f
 ; NEWPM-NEXT: Running pass: LCSSAPass on f
 ; NEWPM-NEXT: Running pass: SROA on f
+; NEWPM-NEXT: Running pass: VectorCombinePass on f
 ; NEWPM-NEXT: Running pass: MergedLoadStoreMotionPass on f
 ; NEWPM-NEXT: Running pass: GVN on f
 ; NEWPM-NEXT: Running analysis: MemoryDependenceAnalysis on f
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -26,8 +26,7 @@
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT: [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
-; CHECK-NEXT: store double [[SUB]], double* [[TMP10]], align 8
+; CHECK-NEXT: store double [[SUB]], double* [[TMP9]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -93,43 +92,99 @@
 ; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[I:%.*]] to i64
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
 ; CHECK-NEXT: [[CMP212_NOT:%.*]] = icmp eq i32 [[I]], 0
-; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
-; CHECK: for.cond1.preheader.us.preheader:
-; CHECK-NEXT: [[DOTPRE_PRE:%.*]] = load <225 x double>, <225 x double>* [[TMP1]], align 8
-; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK: for.cond1.preheader.us:
-; CHECK-NEXT: [[DOTPRE:%.*]] = phi <225 x double> [ [[MATINS_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ [[DOTPRE_PRE]], [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT: [[J_014_US:%.*]] = phi i32 [ [[INC13_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT: [[CONV5_US:%.*]] = zext i32 [[J_014_US]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[CONV5_US]], 15
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], [[CONV6]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 225
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[I]], 225
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV6]]
 ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]]
 ; CHECK: for.body4.us:
-; CHECK-NEXT: [[TMP5:%.*]] = phi <225 x double> [ [[DOTPRE]], [[FOR_COND1_PREHEADER_US]] ], [ [[MATINS_US]], [[FOR_BODY4_US]] ]
 ; CHECK-NEXT: [[K_013_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ]
 ; CHECK-NEXT: [[CONV_US:%.*]] = zext i32 [[K_013_US]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], [[CONV_US]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 225
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP7]])
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP6]]
-; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, double* [[TMP8]], align 8
-; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = extractelement <225 x double> [[TMP5]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[K_013_US]], 225
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]]
+; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, double* [[TMP5]], align 8
+; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = load double, double* [[TMP3]], align 8
 ; CHECK-NEXT: [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]]
-; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = extractelement <225 x double> [[TMP5]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]]
+; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = load double, double* [[TMP6]], align 8
 ; CHECK-NEXT: [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]]
-; CHECK-NEXT: [[MATINS_US]] = insertelement <225 x double> [[TMP5]], double [[SUB_US]], i64 [[TMP6]]
-; CHECK-NEXT: store <225 x double> [[MATINS_US]], <225 x double>* [[TMP1]], align 8
-; CHECK-NEXT: [[INC_US]] = add nuw i32 [[K_013_US]], 1
+; CHECK-NEXT: store double [[SUB_US]], double* [[TMP6]], align 8
+; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[K_013_US]], 1
 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]]
-; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
+; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]]
 ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
-; CHECK-NEXT: [[INC13_US]] = add nuw nsw i32 [[J_014_US]], 1
-; CHECK-NEXT: [[CMP_US:%.*]] = icmp ult i32 [[J_014_US]], 3
-; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[CONV6]], 15
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[I]], 210
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP7]]
+; CHECK-NEXT: br label [[FOR_BODY4_US_1:%.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
+; CHECK: for.body4.us.1:
+; CHECK-NEXT: [[K_013_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ]
+; CHECK-NEXT: [[NARROW:%.*]] = add nuw nsw i32 [[K_013_US_1]], 15
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[NARROW]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[K_013_US_1]], 210
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, double* [[TMP12]], align 8
+; CHECK-NEXT: [[MATRIXEXT8_US_1:%.*]] = load double, double* [[TMP9]], align 8
+; CHECK-NEXT: [[MUL_US_1:%.*]] = fmul double [[MATRIXEXT_US_1]], [[MATRIXEXT8_US_1]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT: [[MATRIXEXT11_US_1:%.*]] = load double, double* [[TMP13]], align 8
+; CHECK-NEXT: [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]]
+; CHECK-NEXT: store double [[SUB_US_1]], double* [[TMP13]], align 8
+; CHECK-NEXT: [[INC_US_1]] = add nuw nsw i32 [[K_013_US_1]], 1
+; CHECK-NEXT: [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]]
+; CHECK-NEXT: br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]]
+; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.1:
+; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[CONV6]], 30
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ult i32 [[I]], 195
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]]
+; CHECK: for.body4.us.2:
+; CHECK-NEXT: [[K_013_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ]
+; CHECK-NEXT: [[NARROW16:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30
+; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[NARROW16]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ult i32 [[K_013_US_2]], 195
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]])
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP17]]
+; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, double* [[TMP19]], align 8
+; CHECK-NEXT: [[MATRIXEXT8_US_2:%.*]] = load double, double* [[TMP16]], align 8
+; CHECK-NEXT: [[MUL_US_2:%.*]] = fmul double [[MATRIXEXT_US_2]], [[MATRIXEXT8_US_2]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP17]]
+; CHECK-NEXT: [[MATRIXEXT11_US_2:%.*]] = load double, double* [[TMP20]], align 8
+; CHECK-NEXT: [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]]
+; CHECK-NEXT: store double [[SUB_US_2]], double* [[TMP20]], align 8
+; CHECK-NEXT: [[INC_US_2]] = add nuw nsw i32 [[K_013_US_2]], 1
+; CHECK-NEXT: [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]]
+; CHECK-NEXT: br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]]
+; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.2:
+; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[CONV6]], 45
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ult i32 [[I]], 180
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP21]]
+; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]]
+; CHECK: for.body4.us.3:
+; CHECK-NEXT: [[K_013_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ]
+; CHECK-NEXT: [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45
+; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[NARROW17]] to i64
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ult i32 [[K_013_US_3]], 180
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP25]])
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP24]]
+; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, double* [[TMP26]], align 8
+; CHECK-NEXT: [[MATRIXEXT8_US_3:%.*]] = load double, double* [[TMP23]], align 8
+; CHECK-NEXT: [[MUL_US_3:%.*]] = fmul double [[MATRIXEXT_US_3]], [[MATRIXEXT8_US_3]]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP24]]
+; CHECK-NEXT: [[MATRIXEXT11_US_3:%.*]] = load double, double* [[TMP27]], align 8
+; CHECK-NEXT: [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]]
+; CHECK-NEXT: store double [[SUB_US_3]], double* [[TMP27]], align 8
+; CHECK-NEXT: [[INC_US_3]] = add nuw nsw i32 [[K_013_US_3]], 1
+; CHECK-NEXT: [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]]
+; CHECK-NEXT: br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]]
 ;
 entry:
   %i.addr = alloca i32, align 4
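For orientation, the phase-ordering test above checks the payoff of the pipeline change: with VectorCombine running early, the extractelement/insertelement operations on the <225 x double> value become scalar loads and stores through GEPs, so the unrolled loop never keeps the whole matrix live in a vector. A rough reconstruction of the kind of C++ input that produces this IR (an illustration only, not the actual source the test was generated from):

```cpp
typedef double m15x15_t __attribute__((matrix_type(15, 15)));

void multiply_sub(m15x15_t &A, m15x15_t &B, unsigned i) {
  // Inner subscripts flatten to col * 15 + row, each guarded by an assume.
  for (unsigned j = 0; j < 4; ++j)
    for (unsigned k = 0; k < i; ++k)
      B[k][j] -= A[k][j] * B[i][j];
}
```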