diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/MatrixBuilder.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" @@ -1939,10 +1940,15 @@ return EmitLoadOfGlobalRegLValue(LV); if (LV.isMatrixElt()) { + llvm::Value *Idx = LV.getMatrixIdx(); + if (CGM.getCodeGenOpts().OptimizationLevel > 0) { + const auto *const MatTy = LV.getType()->getAs<ConstantMatrixType>(); + llvm::MatrixBuilder MB(Builder); + MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened()); + } llvm::LoadInst *Load = Builder.CreateLoad(LV.getMatrixAddress(), LV.isVolatileQualified()); - return RValue::get( - Builder.CreateExtractElement(Load, LV.getMatrixIdx(), "matrixext")); + return RValue::get(Builder.CreateExtractElement(Load, Idx, "matrixext")); } assert(LV.isBitField() && "Unknown LValue type!"); @@ -2080,9 +2086,15 @@ return EmitStoreThroughGlobalRegLValue(Src, Dst); if (Dst.isMatrixElt()) { - llvm::Value *Vec = Builder.CreateLoad(Dst.getMatrixAddress()); - Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(), - Dst.getMatrixIdx(), "matins"); + llvm::Value *Idx = Dst.getMatrixIdx(); + if (CGM.getCodeGenOpts().OptimizationLevel > 0) { + const auto *const MatTy = Dst.getType()->getAs<ConstantMatrixType>(); + llvm::MatrixBuilder MB(Builder); + MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened()); + } + llvm::Instruction *Load = Builder.CreateLoad(Dst.getMatrixAddress()); + llvm::Value *Vec = + Builder.CreateInsertElement(Load, Src.getScalarVal(), Idx, "matins"); Builder.CreateStore(Vec, Dst.getMatrixAddress(), Dst.isVolatileQualified()); return; diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1775,13 +1775,18 @@ // 
integer value. Value *RowIdx = Visit(E->getRowIdx()); Value *ColumnIdx = Visit(E->getColumnIdx()); + + const auto *MatrixTy = E->getBase()->getType()->castAs<ConstantMatrixType>(); + unsigned NumRows = MatrixTy->getNumRows(); + llvm::MatrixBuilder MB(Builder); + Value *Idx = MB.CreateIndex(RowIdx, ColumnIdx, NumRows); + if (CGF.CGM.getCodeGenOpts().OptimizationLevel > 0) + MB.CreateIndexAssumption(Idx, MatrixTy->getNumElementsFlattened()); + Value *Matrix = Visit(E->getBase()); // TODO: Should we emit bounds checks with SanitizerKind::ArrayBounds? - llvm::MatrixBuilder MB(Builder); - return MB.CreateExtractElement( - Matrix, RowIdx, ColumnIdx, - E->getBase()->getType()->castAs<ConstantMatrixType>()->getNumRows()); + return Builder.CreateExtractElement(Matrix, Idx, "matrixext"); } static int getMaskElt(llvm::ShuffleVectorInst *SVI, unsigned Idx, diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c --- a/clang/test/CodeGen/matrix-type-operators.c +++ b/clang/test/CodeGen/matrix-type-operators.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK,OPT %s typedef double dx5x5_t __attribute__((matrix_type(5, 5))); typedef float fx2x3_t __attribute__((matrix_type(2, 3))); @@ -506,7 +507,7 @@ // CHECK-NEXT: [[RES:%.*]] = call <25 x double> @llvm.matrix.multiply.v25f64.v25f64.v25f64(<25 x double> [[B]], <25 x double> [[C]], i32 5, i32 5, i32 5) // CHECK-NEXT: [[A_ADDR:%.*]] = bitcast [25 x double]* %a to <25 x double>* // CHECK-NEXT: store <25 x double> [[RES]], <25 x double>* [[A_ADDR]], align 8 - // CHECK-NEXT: ret void + // CHECK: ret void // dx5x5_t a; @@ -531,7 +532,7 @@ // CHECK-NEXT: 
[[RES:%.*]] = call <81 x i32> @llvm.matrix.multiply.v81i32.v27i32.v27i32(<27 x i32> [[B]], <27 x i32> [[C]], i32 9, i32 3, i32 9) // CHECK-NEXT: [[A_ADDR:%.*]] = bitcast [81 x i32]* %a to <81 x i32>* // CHECK-NEXT: store <81 x i32> [[RES]], <81 x i32>* [[A_ADDR]], align 4 -// CHECK-NEXT: ret void +// CHECK: ret void // void multiply_matrix_matrix_int(ix9x3_t b, ix3x9_t c) { ix9x9_t a; @@ -874,6 +875,8 @@ // CHECK-NEXT: [[K_EXT:%.*]] = zext i32 [[K]] to i64 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K_EXT]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]] // CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4 @@ -890,6 +893,8 @@ // CHECK-NEXT: [[K:%.*]] = load i64, i64* %k.addr, align 8 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]] // CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4 @@ -907,6 +912,8 @@ // CHECK-NEXT: [[I2_ADD:%.*]] = add nsw i32 4, [[I2]] // CHECK-NEXT: [[ADD_EXT:%.*]] = sext i32 [[I2_ADD]] to i64 // CHECK-NEXT: [[IDX2:%.*]] = add i64 18, [[ADD_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <27 x i32> [[MAT]], i32 [[I1]], i64 [[IDX2]] // CHECK-NEXT: store <27 x i32> [[MATINS]], <27 x 
i32>* [[MAT_ADDR]], align 4 @@ -980,9 +987,11 @@ // CHECK-LABEL: @extract_int( // CHECK: [[J1:%.*]] = load i64, i64* %j.addr, align 8 // CHECK-NEXT: [[J2:%.*]] = load i64, i64* %j.addr, align 8 - // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J2]], 9 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J1]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <27 x i32> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret i32 [[MATEXT]] @@ -995,13 +1004,15 @@ // CHECK-LABEL: @test_extract_matrix_pointer1( // CHECK: [[J:%.*]] = load i32, i32* %j.addr, align 4 // CHECK-NEXT: [[J_EXT:%.*]] = zext i32 [[J]] to i64 + // CHECK-NEXT: [[IDX:%.*]] = add i64 3, [[J_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[PTR:%.*]] = load [6 x double]**, [6 x double]*** %ptr.addr, align 8 // CHECK-NEXT: [[PTR_IDX:%.*]] = getelementptr inbounds [6 x double]*, [6 x double]** [[PTR]], i64 1 // CHECK-NEXT: [[PTR2:%.*]] = load [6 x double]*, [6 x double]** [[PTR_IDX]], align 8 // CHECK-NEXT: [[PTR2_IDX:%.*]] = getelementptr inbounds [6 x double], [6 x double]* [[PTR2]], i64 2 // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [6 x double]* [[PTR2_IDX]] to <6 x double>* // CHECK-NEXT: [[MAT:%.*]] = load <6 x double>, <6 x double>* [[MAT_ADDR]], align 8 - // CHECK-NEXT: [[IDX:%.*]] = add i64 3, [[J_EXT]] // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 [[IDX]] // CHECK-NEXT: ret double [[MATEXT]] @@ -1027,13 +1038,17 @@ // CHECK-LABEL: @insert_extract( // CHECK: [[K:%.*]] = load i16, i16* %k.addr, align 2 // CHECK-NEXT: [[K_EXT:%.*]] = sext i16 [[K]] to i64 - // CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[IDX1:%.*]] = mul 
i64 [[K_EXT]], 3 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], 0 - // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 9 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4 + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: [[J:%.*]] = load i64, i64* %j.addr, align 8 // CHECK-NEXT: [[IDX3:%.*]] = mul i64 [[J]], 3 // CHECK-NEXT: [[IDX4:%.*]] = add i64 [[IDX3]], 2 + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX4]], 9 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT2:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], i64 [[IDX4]] // CHECK-NEXT: store <9 x float> [[MATINS]], <9 x float>* [[MAT_ADDR]], align 4 @@ -1068,9 +1083,13 @@ // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_PTR:%.*]] = bitcast [6 x float]* %mat to <6 x float>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4 // CHECK-NEXT: [[EXT:%.*]] = extractelement <6 x float> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: [[SUM:%.*]] = fadd float [[EXT]], {{.*}} + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT2:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4 // CHECK-NEXT: [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], i64 [[IDX2]] // CHECK-NEXT: store <6 x float> [[INS]], <6 x float>* [[MAT_PTR]], align 4 @@ -1085,23 +1104,29 @@ // CHECK-NEXT: [[I1_EXT:%.*]] = sext i32 [[I1]] to i64 // CHECK-NEXT: [[J1:%.*]] = load i32, i32* %j.addr, align 4 // CHECK-NEXT: [[J1_EXT:%.*]] = 
sext i32 [[J1]] to i64 - // CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4 // CHECK-NEXT: [[IDX1_1:%.*]] = mul i64 [[J1_EXT]], 9 // CHECK-NEXT: [[IDX1_2:%.*]] = add i64 [[IDX1_1]], [[I1_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX1_2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4 // CHECK-NEXT: [[MI1:%.*]] = extractelement <27 x i32> [[A]], i64 [[IDX1_2]] // CHECK-NEXT: [[MI1_EXT:%.*]] = sext i32 [[MI1]] to i64 // CHECK-NEXT: [[J2:%.*]] = load i32, i32* %j.addr, align 4 // CHECK-NEXT: [[J2_EXT:%.*]] = sext i32 [[J2]] to i64 // CHECK-NEXT: [[I2:%.*]] = load i32, i32* %i.addr, align 4 // CHECK-NEXT: [[I2_EXT:%.*]] = sext i32 [[I2]] to i64 - // CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[IDX2_1:%.*]] = mul i64 [[I2_EXT]], 9 // CHECK-NEXT: [[IDX2_2:%.*]] = add i64 [[IDX2_1]], [[J2_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2_2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[MI2:%.*]] = extractelement <27 x i32> [[A2]], i64 [[IDX2_2]] // CHECK-NEXT: [[MI3:%.*]] = add nsw i32 [[MI2]], 2 // CHECK-NEXT: [[MI3_EXT:%.*]] = sext i32 [[MI3]] to i64 // CHECK-NEXT: [[IDX3_1:%.*]] = mul i64 [[MI3_EXT]], 5 // CHECK-NEXT: [[IDX3_2:%.*]] = add i64 [[IDX3_1]], [[MI1_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX3_2]], 25 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[B:%.*]] = load <25 x double>, <25 x double>* [[B_PTR:%.*]], align 8 // CHECK-NEXT: [[INS:%.*]] = insertelement <25 x double> [[B]], double 1.500000e+00, i64 [[IDX3_2]] // CHECK-NEXT: store <25 x double> [[INS]], <25 x double>* [[B_PTR]], align 8 diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp --- a/clang/test/CodeGenCXX/matrix-type-operators.cpp +++ 
b/clang/test/CodeGenCXX/matrix-type-operators.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s +// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s +// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck --check-prefixes=CHECK,OPT %s typedef double dx5x5_t __attribute__((matrix_type(5, 5))); using fx2x3_t = float __attribute__((matrix_type(2, 3))); @@ -94,7 +95,7 @@ void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) { // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add1R8MyMatrixIdLj10ELj9EE( - // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8 + // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.+}}, align 8 // CHECK: [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2) // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0 // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer @@ -109,7 +110,7 @@ void test_DoubleWrapper2_Add2(MyMatrix<double, 10, 9> &m) { // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add2R8MyMatrixIdLj10ELj9EE( // CHECK: [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2) - // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8 + // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8 // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0 // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer // CHECK-NEXT: [[RES:%.*]] = fadd <90 x double> [[SCALAR_EMBED1]], [[MATRIX]] @@ -219,6 +220,8 @@ // 
CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [4 x i32]* {{.*}} to <4 x i32>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 4 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <4 x i32>, <4 x i32>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], i64 [[IDX2]] // CHECK-NEXT: store <4 x i32> [[MATINS]], <4 x i32>* [[MAT_ADDR]], align 4 @@ -243,6 +246,8 @@ // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 3 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [24 x float]* {{.*}} to <24 x float>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 24 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <24 x float>, <24 x float>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <24 x float> [[MAT]], float [[E]], i64 [[IDX2]] // CHECK-NEXT: store <24 x float> [[MATINS]], <24 x float>* [[MAT_ADDR]], align 4 @@ -292,10 +297,10 @@ // CHECK-NEXT: [[REF_TMP:%.*]] = alloca double, align 8 // CHECK-NEXT: [[NAMELESS0:%.*]] = bitcast [16 x double]* [[M_ADDR]] to <16 x double>* // CHECK-NEXT: store <16 x double> [[M:%.*]], <16 x double>* [[NAMELESS0]], align 8 - // CHECK-NEXT: [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8 + // CHECK: [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[NAMELESS1]], i64 4 // CHECK-NEXT: store double [[MATEXT]], double* [[REF_TMP]], align 8 - // CHECK-NEXT: ret double* [[REF_TMP]] + // CHECK: ret double* [[REF_TMP]] return m[0][1]; } @@ -315,11 +320,13 @@ // CHECK-NEXT: [[J:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} %j) // CHECK-NEXT: [[J_SUB:%.*]] = sub i32 [[J]], 1 // CHECK-NEXT: [[J_SUB_EXT:%.*]] = zext i32 
[[J_SUB]] to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4 + // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT_ADDR:%.*]] = load [16 x double]*, [16 x double]** %m.addr, align 8 // CHECK-NEXT: [[MAT_ADDR2:%.*]] = bitcast [16 x double]* [[MAT_ADDR]] to <16 x double>* // CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* [[MAT_ADDR2]], align 8 - // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4 - // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]] // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret double [[MATEXT]] return m[i + 1][j - 1]; @@ -358,6 +365,8 @@ // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [16 x float]* %result to <16 x float>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <16 x float>, <16 x float>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, i64 [[IDX2]] // CHECK-NEXT: store <16 x float> [[MATINS]], <16 x float>* [[MAT_ADDR]], align 4 @@ -386,6 +395,8 @@ // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 5 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [25 x i32]* %result to <25 x i32>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 25 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <25 x i32>, <25 x i32>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <25 x i32> [[MAT]], i32 1, i64 [[IDX2]] // CHECK-NEXT: store <25 x i32> [[MATINS]], <25 x i32>* [[MAT_ADDR]], align 4 diff --git a/clang/test/CodeGenObjC/matrix-type-operators.m 
b/clang/test/CodeGenObjC/matrix-type-operators.m --- a/clang/test/CodeGenObjC/matrix-type-operators.m +++ b/clang/test/CodeGenObjC/matrix-type-operators.m @@ -22,9 +22,9 @@ // CHECK-NEXT: [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8* // CHECK-NEXT: [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]]) // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64 -// CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]] +// CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret double [[MATEXT]] // @@ -49,12 +49,12 @@ // CHECK-NEXT: [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8* // CHECK-NEXT: [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]]) // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64 +// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 +// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]] // CHECK-NEXT: [[M:%.*]] = load %1*, %1** %m.addr, align 8 // CHECK-NEXT: [[SEL3:%.*]] = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7 // CHECK-NEXT: [[M_PTR:%.*]] = bitcast %1* [[M]] to i8* // CHECK-NEXT: [[MAT:%.*]] = call <16 x double> bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to <16 x double> (i8*, i8*)*)(i8* [[M_PTR]], i8* [[SEL3]]) -// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 -// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]] // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret double [[MATEXT]] // diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -231,9 +231,23 @@ : 
(IsUnsigned ? B.CreateUDiv(LHS, RHS) : B.CreateSDiv(LHS, RHS)); } - /// Extracts the element at (\p RowIdx, \p ColumnIdx) from \p Matrix. - Value *CreateExtractElement(Value *Matrix, Value *RowIdx, Value *ColumnIdx, - unsigned NumRows, Twine const &Name = "") { + /// Create an assumption that \p Idx is less than \p NumElements. + void CreateIndexAssumption(Value *Idx, unsigned NumElements, + Twine const &Name = "") { + + Value *NumElts = + B.getIntN(Idx->getType()->getScalarSizeInBits(), NumElements); + auto *Cmp = B.CreateICmpULT(Idx, NumElts); + if (auto *ConstCond = dyn_cast<ConstantInt>(Cmp)) + assert(ConstCond->isOne() && "Index must be valid!"); + else + B.CreateAssumption(Cmp); + } + + /// Compute the index to access the element at (\p RowIdx, \p ColumnIdx) from + /// a matrix with \p NumRows embedded in a vector. + Value *CreateIndex(Value *RowIdx, Value *ColumnIdx, unsigned NumRows, + Twine const &Name = "") { unsigned MaxWidth = std::max(RowIdx->getType()->getScalarSizeInBits(), ColumnIdx->getType()->getScalarSizeInBits()); @@ -241,9 +255,7 @@ RowIdx = B.CreateZExt(RowIdx, IntTy); ColumnIdx = B.CreateZExt(ColumnIdx, IntTy); Value *NumRowsV = B.getIntN(MaxWidth, NumRows); - return B.CreateExtractElement( - Matrix, B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx), - "matext"); + return B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx); } };