diff --git a/clang/test/CodeGen/matrix-type-builtins.c b/clang/test/CodeGen/matrix-type-builtins.c --- a/clang/test/CodeGen/matrix-type-builtins.c +++ b/clang/test/CodeGen/matrix-type-builtins.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK64 %s +// RUN: %clang_cc1 -fenable-matrix -triple i386-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK32 %s // Also check we do not crash when running some middle-end passes. Most // importantly this includes the IR verifier, to ensure we emit valid IR. @@ -15,30 +16,33 @@ typedef unsigned ux6x1_t __attribute__((matrix_type(6, 1))); void transpose_double_5x5(dx5x5_t *a) { - // CHECK-LABEL: define{{.*}} void @transpose_double_5x5( - // CHECK: [[A:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 - // CHECK-NEXT: [[TRANS:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[A]], i32 5, i32 5) - // CHECK-NEXT: [[AT_ADDR:%.*]] = bitcast [25 x double]* %a_t to <25 x double>* - // CHECK-NEXT: store <25 x double> [[TRANS]], <25 x double>* [[AT_ADDR]], align 8 + // COMMON-LABEL: define{{.*}} void @transpose_double_5x5( + // CHECK32: [[A:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 4 + // CHECK64: [[A:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 + // COMMON-NEXT: [[TRANS:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[A]], i32 5, i32 5) + // COMMON-NEXT: [[AT_ADDR:%.*]] = bitcast [25 x double]* %a_t to <25 x double>* + // CHECK32-NEXT: store <25 x double> [[TRANS]], <25 x double>* [[AT_ADDR]], align 4 + // CHECK64-NEXT: store <25 x double> [[TRANS]], <25 x double>* [[AT_ADDR]], align 8 + dx5x5_t a_t = __builtin_matrix_transpose(*a); } void transpose_float_3x2(fx3x2_t *a) { - // 
CHECK-LABEL: define{{.*}} void @transpose_float_3x2( - // CHECK: [[A:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4 - // CHECK-NEXT: [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2) - // CHECK-NEXT: [[AT_ADDR:%.*]] = bitcast [6 x float]* %a_t to <6 x float>* - // CHECK-NEXT: store <6 x float> [[TRANS]], <6 x float>* [[AT_ADDR]], align 4 + // COMMON-LABEL: define{{.*}} void @transpose_float_3x2( + // COMMON: [[A:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4 + // COMMON-NEXT: [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2) + // COMMON-NEXT: [[AT_ADDR:%.*]] = bitcast [6 x float]* %a_t to <6 x float>* + // COMMON-NEXT: store <6 x float> [[TRANS]], <6 x float>* [[AT_ADDR]], align 4 fx2x3_t a_t = __builtin_matrix_transpose(*a); } void transpose_int_20x4(ix20x4_t *a) { - // CHECK-LABEL: define{{.*}} void @transpose_int_20x4( - // CHECK: [[A:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4 - // CHECK-NEXT: [[TRANS:%.*]] = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> [[A]], i32 20, i32 4) - // CHECK-NEXT: [[AT_ADDR:%.*]] = bitcast [80 x i32]* %a_t to <80 x i32>* - // CHECK-NEXT: store <80 x i32> [[TRANS]], <80 x i32>* [[AT_ADDR]], align 4 + // COMMON-LABEL: define{{.*}} void @transpose_int_20x4( + // COMMON: [[A:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4 + // COMMON-NEXT: [[TRANS:%.*]] = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> [[A]], i32 20, i32 4) + // COMMON-NEXT: [[AT_ADDR:%.*]] = bitcast [80 x i32]* %a_t to <80 x i32>* + // COMMON-NEXT: store <80 x i32> [[TRANS]], <80 x i32>* [[AT_ADDR]], align 4 ix4x20_t a_t = __builtin_matrix_transpose(*a); } @@ -49,26 +53,28 @@ }; void transpose_struct_member(struct Foo *F) { - // CHECK-LABEL: define{{.*}} void @transpose_struct_member( - // CHECK: [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4 - // CHECK-NEXT: [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x 
i32> [[M]], i32 1, i32 6) - // CHECK-NEXT: [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8 - // CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 1 - // CHECK-NEXT: [[OUT_PTR_C:%.*]] = bitcast [6 x i32]* [[OUT_PTR]] to <6 x i32>* - // CHECK-NEXT: store <6 x i32> [[M_T]], <6 x i32>* [[OUT_PTR_C]], align 4 + // COMMON-LABEL: define{{.*}} void @transpose_struct_member( + // COMMON: [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4 + // COMMON-NEXT: [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6) + // CHECK32-NEXT: [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 4 + // CHECK64-NEXT: [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8 + // COMMON-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 1 + // COMMON-NEXT: [[OUT_PTR_C:%.*]] = bitcast [6 x i32]* [[OUT_PTR]] to <6 x i32>* + // COMMON-NEXT: store <6 x i32> [[M_T]], <6 x i32>* [[OUT_PTR_C]], align 4 F->out = __builtin_matrix_transpose(F->in); } void transpose_transpose_struct_member(struct Foo *F) { - // CHECK-LABEL: define{{.*}} void @transpose_transpose_struct_member( - // CHECK: [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4 - // CHECK-NEXT: [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6) - // CHECK-NEXT: [[M_T2:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M_T]], i32 6, i32 1) - // CHECK-NEXT: [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8 - // CHECK-NEXT: [[IN_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 0 - // CHECK-NEXT: [[IN_PTR_C:%.*]] = bitcast [6 x i32]* [[IN_PTR]] to <6 x i32>* - // CHECK-NEXT: store <6 x i32> [[M_T2]], <6 x i32>* [[IN_PTR_C]], align 4 + // COMMON-LABEL: define{{.*}} void @transpose_transpose_struct_member( + // COMMON: [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 
4 + // COMMON-NEXT: [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6) + // COMMON-NEXT: [[M_T2:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M_T]], i32 6, i32 1) + // CHECK32-NEXT: [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 4 + // CHECK64-NEXT: [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8 + // COMMON-NEXT: [[IN_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 0 + // COMMON-NEXT: [[IN_PTR_C:%.*]] = bitcast [6 x i32]* [[IN_PTR]] to <6 x i32>* + // COMMON-NEXT: store <6 x i32> [[M_T2]], <6 x i32>* [[IN_PTR_C]], align 4 F->in = __builtin_matrix_transpose(__builtin_matrix_transpose(F->in)); } @@ -76,13 +82,16 @@ dx5x5_t get_matrix(); void transpose_rvalue() { - // CHECK-LABEL: define{{.*}} void @transpose_rvalue() - // CHECK-NEXT: entry: - // CHECK-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 8 - // CHECK-NEXT: [[CALL:%.*]] = call <25 x double> (...) @get_matrix() - // CHECK-NEXT: [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[CALL]], i32 5, i32 5) - // CHECK-NEXT: [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>* - // CHECK-NEXT: store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 8 + // COMMON-LABEL: define{{.*}} void @transpose_rvalue() + // COMMON-NEXT: entry: + // CHECK32-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 4 + // CHECK64-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 8 + // CHECK32-NEXT: [[CALL:%.*]] = call <25 x double> bitcast (<25 x double> (...)* @get_matrix to <25 x double> ()*)() + // CHECK64-NEXT: [[CALL:%.*]] = call <25 x double> (...) 
@get_matrix() + // COMMON-NEXT: [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[CALL]], i32 5, i32 5) + // COMMON-NEXT: [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>* + // CHECK32-NEXT: store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 4 + // CHECK64-NEXT: store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 8 dx5x5_t m_t = __builtin_matrix_transpose(get_matrix()); } @@ -90,162 +99,215 @@ const dx5x5_t global_matrix; void transpose_global() { - // CHECK-LABEL: define{{.*}} void @transpose_global() - // CHECK-NEXT: entry: - // CHECK-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 8 - // CHECK-NEXT: [[GLOBAL_MATRIX:%.*]] = load <25 x double>, <25 x double>* bitcast ([25 x double]* @global_matrix to <25 x double>*), align 8 - // CHECK-NEXT: [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[GLOBAL_MATRIX]], i32 5, i32 5) - // CHECK-NEXT: [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>* - // CHECK-NEXT: store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 8 + // COMMON-LABEL: define{{.*}} void @transpose_global() + // COMMON-NEXT: entry: + // CHECK32-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 4 + // CHECK32-NEXT: [[GLOBAL_MATRIX:%.*]] = load <25 x double>, <25 x double>* bitcast ([25 x double]* @global_matrix to <25 x double>*), align 4 + // CHECK64-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 8 + // CHECK64-NEXT: [[GLOBAL_MATRIX:%.*]] = load <25 x double>, <25 x double>* bitcast ([25 x double]* @global_matrix to <25 x double>*), align 8 + // COMMON-NEXT: [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[GLOBAL_MATRIX]], i32 5, i32 5) + // COMMON-NEXT: [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>* + // CHECK32-NEXT: store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 4 + // CHECK64-NEXT: store <25 x double> [[M_T]], <25 x 
double>* [[M_T_ADDR_C]], align 8 dx5x5_t m_t = __builtin_matrix_transpose(global_matrix); } void column_major_load_with_const_stride_double(double *Ptr) { - // CHECK-LABEL: define{{.*}} void @column_major_load_with_const_stride_double(double* %Ptr) - // CHECK: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_load_with_const_stride_double(double* %Ptr) + // CHECK32: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(double* align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5) + // CHECK64: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5); } void column_major_load_with_const_stride2_double(double *Ptr) { - // CHECK-LABEL: define{{.*}} void @column_major_load_with_const_stride2_double(double* %Ptr) - // CHECK: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_load_with_const_stride2_double(double* %Ptr) + // CHECK32: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(double* align 4 [[PTR]], i32 15, i1 false, i32 5, i32 5) + // CHECK64: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(double* align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5) dx5x5_t m_a2 = __builtin_matrix_column_major_load(Ptr, 5, 5, 2 * 3 + 9); } void 
column_major_load_with_variable_stride_ull_float(float *Ptr, unsigned long long S) { - // CHECK-LABEL: define{{.*}} void @column_major_load_with_variable_stride_ull_float(float* %Ptr, i64 %S) - // CHECK: [[S:%.*]] = load i64, i64* %S.addr, align 8 - // CHECK-NEXT: [[PTR:%.*]] = load float*, float** %Ptr.addr, align 8 - // CHECK-NEXT: call <6 x float> @llvm.matrix.column.major.load.v6f32(float* align 4 [[PTR]], i64 [[S]], i1 false, i32 2, i32 3) + // COMMON-LABEL: define{{.*}} void @column_major_load_with_variable_stride_ull_float(float* %Ptr, i64 %S) + // CHECK32: [[S:%.*]] = load i64, i64* %S.addr, align 8 + // CHECK32-NEXT: [[STRIDE_TRUNC:%.*]] = trunc i64 [[S]] to i32 + // CHECK32-NEXT: [[PTR:%.*]] = load float*, float** %Ptr.addr, align 4 + // CHECK32-NEXT: call <6 x float> @llvm.matrix.column.major.load.v6f32.i32(float* align 4 [[PTR]], i32 [[STRIDE_TRUNC]], i1 false, i32 2, i32 3) + + // CHECK64: [[S:%.*]] = load i64, i64* %S.addr, align 8 + // CHECK64-NEXT: [[PTR:%.*]] = load float*, float** %Ptr.addr, align 8 + // CHECK64-NEXT: call <6 x float> @llvm.matrix.column.major.load.v6f32.i64(float* align 4 [[PTR]], i64 [[S]], i1 false, i32 2, i32 3) fx2x3_t m_b = __builtin_matrix_column_major_load(Ptr, 2, 3, S); } void column_major_load_with_stride_math_int(int *Ptr, int S) { - // CHECK-LABEL: define{{.*}} void @column_major_load_with_stride_math_int(i32* %Ptr, i32 %S) - // CHECK: [[S:%.*]] = load i32, i32* %S.addr, align 4 - // CHECK-NEXT: [[STRIDE:%.*]] = add nsw i32 [[S]], 32 - // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64 - // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call <80 x i32> @llvm.matrix.column.major.load.v80i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20) + // COMMON-LABEL: define{{.*}} void @column_major_load_with_stride_math_int(i32* %Ptr, i32 %S) + // COMMON: [[S:%.*]] = load i32, i32* %S.addr, align 4 + // COMMON-NEXT: [[STRIDE:%.*]] = add nsw i32 [[S]], 32 + // 
CHECK32-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 4 + // CHECK32-NEXT: call <80 x i32> @llvm.matrix.column.major.load.v80i32.i32(i32* align 4 [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 20) + // + // CHECK64-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64 + // CHECK64-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 + // CHECK64-NEXT: call <80 x i32> @llvm.matrix.column.major.load.v80i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20) ix4x20_t m_c = __builtin_matrix_column_major_load(Ptr, 4, 20, S + 32); } void column_major_load_with_stride_math_s_int(int *Ptr, short S) { - // CHECK-LABEL: define{{.*}} void @column_major_load_with_stride_math_s_int(i32* %Ptr, i16 signext %S) - // CHECK: [[S:%.*]] = load i16, i16* %S.addr, align 2 - // CHECK-NEXT: [[S_EXT:%.*]] = sext i16 [[S]] to i32 - // CHECK-NEXT: [[STRIDE:%.*]] = add nsw i32 [[S_EXT]], 32 - // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64 - // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20) + // COMMON-LABEL: define{{.*}} void @column_major_load_with_stride_math_s_int(i32* %Ptr, i16 signext %S) + // COMMON: [[S:%.*]] = load i16, i16* %S.addr, align 2 + // COMMON-NEXT: [[S_EXT:%.*]] = sext i16 [[S]] to i32 + // COMMON-NEXT: [[STRIDE:%.*]] = add nsw i32 [[S_EXT]], 32 + // CHECK32-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 4 + // CHECK32-NEXT: %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32.i32(i32* align 4 [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 20) + // + // CHECK64-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64 + // CHECK64-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 + // CHECK64-NEXT: %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20) ix4x20_t m_c = 
__builtin_matrix_column_major_load(Ptr, 4, 20, S + 32); } void column_major_load_array1(double Ptr[25]) { - // CHECK-LABEL: define{{.*}} void @column_major_load_array1(double* %Ptr) - // CHECK: [[ADDR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[ADDR]], i64 5, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_load_array1(double* %Ptr) + // CHECK32: [[ADDR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(double* align 4 [[ADDR]], i32 5, i1 false, i32 5, i32 5) + + // CHECK64: [[ADDR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(double* align 8 [[ADDR]], i64 5, i1 false, i32 5, i32 5) dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5); } void column_major_load_array2() { - // CHECK-LABEL: define{{.*}} void @column_major_load_array2() #0 { - // CHECK-NEXT: entry: - // CHECK-NEXT: [[PTR:%.*]] = alloca [25 x double], align 16 - // CHECK: [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], [25 x double]* [[PTR]], i64 0, i64 0 - // CHECK-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 16 [[ARRAY_DEC]], i64 5, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_load_array2() #0 { + // COMMON-NEXT: entry: + // CHECK32-NEXT: [[PTR:%.*]] = alloca [25 x double], align 8 + // CHECK32: [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], [25 x double]* [[PTR]], i32 0, i32 0 + // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(double* align 8 [[ARRAY_DEC]], i32 5, i1 false, i32 5, i32 5) + + // CHECK64-NEXT: [[PTR:%.*]] = alloca [25 x double], align 16 + // CHECK64: [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], [25 x double]* [[PTR]], i64 0, i64 0 + // CHECK64-NEXT: call <25 x double> 
@llvm.matrix.column.major.load.v25f64.i64(double* align 16 [[ARRAY_DEC]], i64 5, i1 false, i32 5, i32 5) double Ptr[25]; dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5); } void column_major_load_const(const double *Ptr) { - // CHECK-LABEL: define{{.*}} void @column_major_load_const(double* %Ptr) - // CHECK: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_load_const(double* %Ptr) + // CHECK32: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(double* align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5) + // + // CHECK64: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5); } void column_major_load_volatile(volatile double *Ptr) { - // CHECK-LABEL: define{{.*}} void @column_major_load_volatile(double* %Ptr) - // CHECK: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_load_volatile(double* %Ptr) + // CHECK32: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(double* align 4 [[PTR]], i32 5, i1 true, i32 5, i32 5) + // + // CHECK64: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(double* align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5) dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5); } void 
column_major_store_with_const_stride_double(double *Ptr) { - // CHECK-LABEL: define{{.*}} void @column_major_store_with_const_stride_double(double* %Ptr) - // CHECK: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 - // CHECK-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_store_with_const_stride_double(double* %Ptr) + // CHECK32: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 4 + // CHECK32-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], double* align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5) + // + // CHECK64: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 + // CHECK64-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) dx5x5_t m; __builtin_matrix_column_major_store(m, Ptr, 5); } void column_major_store_with_const_stride2_double(double *Ptr) { - // CHECK-LABEL: define{{.*}} void @column_major_store_with_const_stride2_double(double* %Ptr) - // CHECK: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 - // CHECK-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_store_with_const_stride2_double(double* %Ptr) + // CHECK32: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 4 + // CHECK32-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> 
[[M]], double* align 4 [[PTR]], i32 15, i1 false, i32 5, i32 5) + // + // CHECK64: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 + // CHECK64-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5) // dx5x5_t m; __builtin_matrix_column_major_store(m, Ptr, 2 * 3 + 9); } void column_major_store_with_stride_math_int(int *Ptr, int S) { - // CHECK-LABEL: define{{.*}} void @column_major_store_with_stride_math_int(i32* %Ptr, i32 %S) - // CHECK: [[M:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4 - // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: [[S:%.*]] = load i32, i32* %S.addr, align 4 - // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[S]], 32 - // CHECK-NEXT: [[IDX:%.*]] = sext i32 [[ADD]] to i64 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v80i32(<80 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20) + // COMMON-LABEL: define{{.*}} void @column_major_store_with_stride_math_int(i32* %Ptr, i32 %S) + // COMMON: [[M:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4 + // CHECK32-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 4 + // CHECK64-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 + // COMMON-NEXT: [[S:%.*]] = load i32, i32* %S.addr, align 4 + // COMMON-NEXT: [[ADD:%.*]] = add nsw i32 [[S]], 32 + // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v80i32.i32(<80 x i32> [[M]], i32* align 4 [[PTR]], i32 [[ADD]], i1 false, i32 4, i32 20) + // + // CHECK64-NEXT: [[IDX:%.*]] = sext i32 [[ADD]] to i64 + // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v80i32.i64(<80 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20) ix4x20_t m; __builtin_matrix_column_major_store(m, Ptr, S + 32); } void column_major_store_with_stride_math_s_int(int *Ptr, short S) { - // CHECK-LABEL: 
define{{.*}} void @column_major_store_with_stride_math_s_int(i32* %Ptr, i16 signext %S) - // CHECK: [[M:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4 - // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: [[S:%.*]] = load i16, i16* %S.addr, align 2 - // CHECK-NEXT: [[EXT:%.*]] = sext i16 [[S]] to i32 - // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[EXT]], 2 - // CHECK-NEXT: [[IDX:%.*]] = sext i32 [[ADD]] to i64 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v80i32(<80 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20) + // COMMON-LABEL: define{{.*}} void @column_major_store_with_stride_math_s_int(i32* %Ptr, i16 signext %S) + // COMMON: [[M:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4 + // CHECK32-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 4 + // CHECK64-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 + // COMMON-NEXT: [[S:%.*]] = load i16, i16* %S.addr, align 2 + // COMMON-NEXT: [[EXT:%.*]] = sext i16 [[S]] to i32 + // COMMON-NEXT: [[ADD:%.*]] = add nsw i32 [[EXT]], 2 + // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v80i32.i32(<80 x i32> [[M]], i32* align 4 [[PTR]], i32 [[ADD]], i1 false, i32 4, i32 20) + // + // CHECK64-NEXT: [[IDX:%.*]] = sext i32 [[ADD]] to i64 + // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v80i32.i64(<80 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20) ix4x20_t m; __builtin_matrix_column_major_store(m, Ptr, S + 2); } void column_major_store_array1(double Ptr[25]) { - // CHECK-LABEL: define{{.*}} void @column_major_store_array1(double* %Ptr) - // CHECK: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 - // CHECK-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_store_array1(double* %Ptr) 
+ // CHECK32: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 4 + // CHECK32-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], double* align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5) + // + // CHECK64: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 + // CHECK64-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5) dx5x5_t m; __builtin_matrix_column_major_store(m, Ptr, 5); } void column_major_store_array2() { - // CHECK-LABEL: define{{.*}} void @column_major_store_array2() - // CHECK: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 - // CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds [25 x double], [25 x double]* %Ptr, i64 0, i64 0 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 16 [[PTR]], i64 5, i1 false, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_store_array2() + // CHECK32: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 4 + // CHECK32-NEXT: [[PTR:%.*]] = getelementptr inbounds [25 x double], [25 x double]* %Ptr, i32 0, i32 0 + // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], double* align 8 [[PTR]], i32 5, i1 false, i32 5, i32 5) + // + // CHECK64: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 + // CHECK64-NEXT: [[PTR:%.*]] = getelementptr inbounds [25 x double], [25 x double]* %Ptr, i64 0, i64 0 + // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], double* align 16 [[PTR]], i64 5, i1 false, i32 5, i32 5) double Ptr[25]; dx5x5_t m; @@ -253,10 +315,14 @@ } void column_major_store_volatile(volatile double *Ptr) { - // CHECK-LABEL: define{{.*}} void 
@column_major_store_volatile(double* %Ptr) #0 { - // CHECK: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 - // CHECK-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5) + // COMMON-LABEL: define{{.*}} void @column_major_store_volatile(double* %Ptr) #0 { + // CHECK32: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 4 + // CHECK32-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 4 + // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], double* align 4 [[PTR]], i32 5, i1 true, i32 5, i32 5) + // + // CHECK64: [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8 + // CHECK64-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 + // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5) dx5x5_t m; __builtin_matrix_column_major_store(m, Ptr, 5); diff --git a/clang/test/CodeGenCXX/matrix-type-builtins.cpp b/clang/test/CodeGenCXX/matrix-type-builtins.cpp --- a/clang/test/CodeGenCXX/matrix-type-builtins.cpp +++ b/clang/test/CodeGenCXX/matrix-type-builtins.cpp @@ -94,7 +94,7 @@ // CHECK-LABEL: define linkonce_odr <40 x double> @_Z29column_major_load_with_strideIdLj10ELj4ELj15EEu11matrix_typeIXT0_EXT1_ET_EPS0_(double* %Ptr) // CHECK: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call <40 x double> @llvm.matrix.column.major.load.v40f64(double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4) + // CHECK-NEXT: call <40 x double> @llvm.matrix.column.major.load.v40f64.i64(double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4) matrix_t M1 = column_major_load_with_stride(Ptr); } @@ -106,7 +106,7 @@ // CHECK-LABEL: define linkonce_odr <6 x i32> 
@_Z29column_major_load_with_strideIiLj3ELj2ELj12EEu11matrix_typeIXT0_EXT1_ET_EPS0_(i32* %Ptr) // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32(i32* align 4 [[PTR]], i64 12, i1 false, i32 3, i32 2) + // CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32.i64(i32* align 4 [[PTR]], i64 12, i1 false, i32 3, i32 2) matrix_t M1 = column_major_load_with_stride(Ptr); } @@ -124,7 +124,7 @@ // CHECK-NEXT: [[STRIDE:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} [[W]]) // CHECK-NEXT: [[STRIDE_EXT:%.*]] = zext i32 [[STRIDE]] to i64 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2) + // CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2) matrix_t M1 = __builtin_matrix_column_major_load(Ptr, 2, 2, W); } @@ -133,7 +133,7 @@ void test_column_major_load_constexpr_num_rows(int *Ptr) { // CHECK-LABEL: define{{.*}} void @_Z41test_column_major_load_constexpr_num_rowsPi(i32* %Ptr) // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2) + // CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2) matrix_t M1 = __builtin_matrix_column_major_load(Ptr, constexpr3(), 2, 3); } @@ -143,7 +143,7 @@ void test_column_major_load_constexpr_num_columns(int *Ptr) { // CHECK-LABEL: define{{.*}} void @_Z44test_column_major_load_constexpr_num_columnsPi(i32* %Ptr) // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call <2 x i32> @llvm.matrix.column.major.load.v2i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 1) + // CHECK-NEXT: call <2 x 
i32> @llvm.matrix.column.major.load.v2i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 1) matrix_t M1 = __builtin_matrix_column_major_load(Ptr, 2, constexpr1(), 3); } @@ -153,7 +153,7 @@ void test_column_major_load_constexpr_num_columns_temp(int *Ptr) { // CHECK-LABEL: define{{.*}} void @_Z49test_column_major_load_constexpr_num_columns_tempPi(i32* %Ptr) // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call <10 x i32> @llvm.matrix.column.major.load.v10i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 5) + // CHECK-NEXT: call <10 x i32> @llvm.matrix.column.major.load.v10i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 5) matrix_t M1 = __builtin_matrix_column_major_load(Ptr, 2, constexpr_plus1<4>(), 3); } @@ -162,7 +162,7 @@ // CHECK: [[STRIDE:%.*]] = call i32 @_Z10constexpr3v() // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2) + // CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2) matrix_t M1 = __builtin_matrix_column_major_load(Ptr, 2, 2, constexpr3()); } @@ -200,7 +200,7 @@ // CHECK-LABEL: define linkonce_odr void @_Z30column_major_store_with_strideIdLj10ELj4ELj15EEvRu11matrix_typeIXT0_EXT1_ET_EPS0_([40 x double]* nonnull align 8 dereferenceable(320) %m, double* %Ptr) // CHECK: [[M:%.*]] = load <40 x double>, <40 x double>* {{.*}}, align 8 // CHECK-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v40f64(<40 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4) + // CHECK-NEXT: call void @llvm.matrix.column.major.store.v40f64.i64(<40 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4) matrix_t M1; 
column_major_store_with_stride(M1, Ptr); @@ -214,7 +214,7 @@ // CHECK-LABEL: define linkonce_odr void @_Z30column_major_store_with_strideIiLj3ELj2ELj3EEvRu11matrix_typeIXT0_EXT1_ET_EPS0_([6 x i32]* nonnull align 4 dereferenceable(24) %m, i32* %Ptr) // CHECK: [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v6i32(<6 x i32> [[M]], i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2) + // CHECK-NEXT: call void @llvm.matrix.column.major.store.v6i32.i64(<6 x i32> [[M]], i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2) matrix_t M1; column_major_store_with_stride(M1, Ptr); @@ -227,7 +227,7 @@ // CHECK-NEXT: [[W:%.*]] = load %struct.UnsignedWrapper*, %struct.UnsignedWrapper** %W.addr, align 8 // CHECK-NEXT: [[IDX:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} [[W]]) // CHECK-NEXT: [[IDX_EXT:%.*]] = zext i32 [[IDX]] to i64 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2) + // CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32.i64(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2) matrix_t M1; __builtin_matrix_column_major_store(M1, Ptr, W); @@ -239,7 +239,7 @@ // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8 // CHECK-NEXT: [[IDX:%.*]] = call i32 @_Z10constexpr3v() // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IDX]] to i64 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2) + // CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32.i64(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2) matrix_t M; __builtin_matrix_column_major_store(M, Ptr, constexpr3()); diff --git a/clang/test/CodeGenObjC/matrix-type-builtins.m 
b/clang/test/CodeGenObjC/matrix-type-builtins.m --- a/clang/test/CodeGenObjC/matrix-type-builtins.m +++ b/clang/test/CodeGenObjC/matrix-type-builtins.m @@ -56,7 +56,7 @@ // CHECK: [[STRIDE:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*) // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64 // CHECK: [[PTR:%.*]] = call i32* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32* (i8*, i8*)*) - // CHECK-NEXT: call <12 x i32> @llvm.matrix.column.major.load.v12i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 3, i32 4) + // CHECK-NEXT: call <12 x i32> @llvm.matrix.column.major.load.v12i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 3, i32 4) u3x4 m = __builtin_matrix_column_major_load(Ptr.value, 3, 4, Stride.value); } @@ -67,7 +67,7 @@ // CHECK: [[PTR:%.*]] = call i32* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32* (i8*, i8*)*) // CHECK: [[IDX:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*) // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IDX]] to i64 - // CHECK-NEXT: call void @llvm.matrix.column.major.store.v12i32(<12 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 3, i32 4) + // CHECK-NEXT: call void @llvm.matrix.column.major.store.v12i32.i64(<12 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 3, i32 4) __builtin_matrix_column_major_store(M.value, Ptr.value, Stride.value); } diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -17252,11 +17252,12 @@ The '``llvm.matrix.column.major.load.*``' intrinsics load a ``<Rows> x <Cols>`` matrix using a stride of ``%Stride`` to compute the start address of the -different columns. This allows for convenient loading of sub matrixes. If -``<IsVolatile>`` is true, the intrinsic is considered a :ref:`volatile memory -access <volatile>`. The result matrix is returned in the result vector.
If the -``%Ptr`` argument is known to be aligned to some boundary, this can be -specified as an attribute on the argument. +different columns. The offset is computed using ``%Stride``'s bitwidth. This +allows for convenient loading of sub matrixes. If ``<IsVolatile>`` is true, the +intrinsic is considered a :ref:`volatile memory access <volatile>`. The result +matrix is returned in the result vector. If the ``%Ptr`` argument is known to +be aligned to some boundary, this can be specified as an attribute on the +argument. Arguments: """""""""" @@ -17291,7 +17292,8 @@ The '``llvm.matrix.column.major.store.*``' intrinsics store the ``<Rows> x <Cols>`` matrix in ``%In`` to memory using a stride of ``%Stride`` between -columns. If ``<IsVolatile>`` is true, the intrinsic is considered a +columns. The offset is computed using ``%Stride``'s bitwidth. If +``<IsVolatile>`` is true, the intrinsic is considered a :ref:`volatile memory access <volatile>`. If the ``%Ptr`` argument is known to be aligned to some boundary, this can be diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1668,7 +1668,7 @@ def int_matrix_column_major_load : DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMPointerToElt<0>, llvm_i64_ty, llvm_i1_ty, + [LLVMPointerToElt<0>, llvm_anyint_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrReadMem, NoCapture>, ImmArg>, ImmArg>, @@ -1677,7 +1677,7 @@ def int_matrix_column_major_store : DefaultAttrsIntrinsic<[], [llvm_anyvector_ty, LLVMPointerToElt<0>, - llvm_i64_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty], + llvm_anyint_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>, ImmArg>]>; diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -74,7 +74,7 @@ Value
*Ops[] = {DataPtr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows), B.getInt32(Columns)}; - Type *OverloadedTypes[] = {RetType}; + Type *OverloadedTypes[] = {RetType, Stride->getType()}; Function *TheFn = Intrinsic::getDeclaration( getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes); @@ -97,7 +97,7 @@ Value *Ops[] = {Matrix, Ptr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows), B.getInt32(Columns)}; - Type *OverloadedTypes[] = {Matrix->getType()}; + Type *OverloadedTypes[] = {Matrix->getType(), Stride->getType()}; Function *TheFn = Intrinsic::getDeclaration( getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes); diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -981,8 +981,9 @@ Value *EltPtr = createElementPtr(Ptr, EltTy, Builder); MatrixTy Result; for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride, - Shape.getStride(), EltTy, Builder); + Value *GEP = computeVectorAddr( + EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I), + Stride, Shape.getStride(), EltTy, Builder); Value *Vector = Builder.CreateAlignedLoad( VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign), IsVolatile, "col.load"); @@ -1071,9 +1072,11 @@ auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); for (auto Vec : enumerate(StoreVal.vectors())) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()), - Stride, StoreVal.getStride(), - VType->getElementType(), Builder); + Value *GEP = computeVectorAddr( + EltPtr, + Builder.getIntN(Stride->getType()->getScalarSizeInBits(), + Vec.index()), + Stride, StoreVal.getStride(), VType->getElementType(), Builder); Builder.CreateAlignedStore(Vec.value(), GEP, getAlignForIndex(Vec.index(), 
Stride, VType->getElementType(), diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll @@ -23,11 +23,11 @@ ; CHECK-NEXT: ret <9 x double> [[TMP2]] ; entry: - %load = call <9 x double> @llvm.matrix.column.major.load(double* %in, i64 %stride, i1 false, i32 3, i32 3) + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double* %in, i64 %stride, i1 false, i32 3, i32 3) ret <9 x double> %load } -declare <9 x double> @llvm.matrix.column.major.load(double*, i64, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double*, i64, i1, i32, i32) define <9 x double> @strided_load_9x1(double* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_9x1( @@ -39,12 +39,11 @@ ; CHECK-NEXT: ret <9 x double> [[COL_LOAD]] ; entry: - %load = call <9 x double> @llvm.matrix.column.major.load(double* %in, i64 %stride, i1 false, i32 9, i32 1) + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double* %in, i64 %stride, i1 false, i32 9, i32 1) ret <9 x double> %load } -declare <8 x double> @llvm.matrix.column.major.load.v8f64(double*, i64, i1, i32, i32) -; CHECK: declare <8 x double> @llvm.matrix.column.major.load.v8f64(double* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY:#[0-9]]] +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i64(double*, i64, i1, i32, i32) define <8 x double> @strided_load_4x2(double* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_4x2( @@ -61,9 +60,27 @@ ; CHECK-NEXT: ret <8 x double> [[TMP0]] ; entry: - %load = call <8 x double> @llvm.matrix.column.major.load.v8f64(double* %in, i64 %stride, i1 false, i32 4, i32 2) + %load = call <8 x double> @llvm.matrix.column.major.load.v8f64.i64(double* %in, i64 %stride, i1 false, i32 4, i32 2) ret <8 x double> %load } -; 
CHECK: declare <9 x double> @llvm.matrix.column.major.load.v9f64(double* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY]] -; CHECK: attributes [[READONLY]] = { argmemonly nofree nosync nounwind readonly willreturn } +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(double*, i32, i1, i32, i32) + +define <8 x double> @strided_load_4x2_stride_i32(double* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_4x2_stride_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[IN:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST]], align 8 +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[IN]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <4 x double>* +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST3]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP0]] +; +entry: + %load = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(double* %in, i32 %stride, i1 false, i32 4, i32 2) + ret <8 x double> %load +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[VEC_CAST2]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.column.major.store.v6f64(<6 x double> %in, double* %out, i64 5, i1 false, i32 3, i32 2) + call void 
@llvm.matrix.column.major.store.v6f64.i64(<6 x double> %in, double* %out, i64 5, i1 false, i32 3, i32 2) ret void } @@ -31,7 +31,25 @@ ; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[VEC_CAST4]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.column.major.store.v6f64(<6 x double> %in, double* %out, i64 %stride, i1 false, i32 3, i32 2) + call void @llvm.matrix.column.major.store.v6f64.i64(<6 x double> %in, double* %out, i64 %stride, i1 false, i32 3, i32 2) + ret void +} + +define void @strided_store_3x2_nonconst_i32_stride(<6 x double> %in, i32 %stride, double* %out) { +; CHECK-LABEL: @strided_store_3x2_nonconst_i32_stride( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> poison, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> poison, <3 x i32> +; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* +; CHECK-NEXT: store <3 x double> [[SPLIT]], <3 x double>* [[VEC_CAST]], align 8 +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <3 x double>* +; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[VEC_CAST4]], align 8 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.column.major.store.v6f64.i32(<6 x double> %in, double* %out, i32 %stride, i1 false, i32 3, i32 2) ret void } @@ -58,13 +76,14 @@ ; CHECK-NEXT: store <2 x double> [[SPLIT4]], <2 x double>* [[VEC_CAST11]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.column.major.store.v10f64(<10 x double> %in, double* %out, i64 4, i1 false, i32 2, i32 5) + call void @llvm.matrix.column.major.store.v10f64.i64(<10 x double> %in, double* %out, i64 4, i1 false, i32 2, 
i32 5) ret void } -declare void @llvm.matrix.column.major.store.v6f64(<6 x double>, double*, i64, i1, i32, i32) -declare void @llvm.matrix.column.major.store.v10f64(<10 x double>, double*, i64, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v6f64.i64(<6 x double>, double*, i64, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v6f64.i32(<6 x double>, double*, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v10f64.i64(<10 x double>, double*, i64, i1, i32, i32) -; CHECK: declare void @llvm.matrix.column.major.store.v6f64(<6 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) #0 -; CHECK: declare void @llvm.matrix.column.major.store.v10f64(<10 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) #0 +; CHECK: declare void @llvm.matrix.column.major.store.v6f64.i64(<6 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) #0 +; CHECK: declare void @llvm.matrix.column.major.store.v10f64.i64(<10 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) #0 ; CHECK: attributes #0 = { argmemonly nofree nosync nounwind willreturn writeonly } diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll --- a/llvm/test/Verifier/matrix-intrinsics.ll +++ b/llvm/test/Verifier/matrix-intrinsics.ll @@ -39,11 +39,11 @@ ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector! 
; CHECK-NEXT: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg -; CHECK-NEXT: %result.3 = call <6 x float> @llvm.matrix.column.major.load.v6f32(float* %n, i64 2, i1 true, i32 3, i32 %arg) - %result.0 = call <4 x float> @llvm.matrix.column.major.load.v4f32(float* %m, i64 0, i1 false, i32 0, i32 0) - %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32(float* %m, i64 2, i1 false, i32 1, i32 2) - %result.2 = call <6 x float> @llvm.matrix.column.major.load.v6f32(float* %n, i64 2, i1 true, i32 3, i32 3) - %result.3 = call <6 x float> @llvm.matrix.column.major.load.v6f32(float* %n, i64 2, i1 true, i32 3, i32 %arg) +; CHECK-NEXT: %result.3 = call <6 x float> @llvm.matrix.column.major.load.v6f32.i64(float* %n, i64 2, i1 true, i32 3, i32 %arg) + %result.0 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i64(float* %m, i64 0, i1 false, i32 0, i32 0) + %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i64(float* %m, i64 2, i1 false, i32 1, i32 2) + %result.2 = call <6 x float> @llvm.matrix.column.major.load.v6f32.i64(float* %n, i64 2, i1 true, i32 3, i32 3) + %result.3 = call <6 x float> @llvm.matrix.column.major.load.v6f32.i64(float* %n, i64 2, i1 true, i32 3, i32 %arg) ret <4 x float> %result.1 } @@ -52,10 +52,10 @@ ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector! ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector! ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector! 
- call void @llvm.matrix.column.major.store.v4f32(<4 x float> zeroinitializer, float* %m, i64 0, i1 false, i32 0, i32 0) - call void @llvm.matrix.column.major.store.v4f32(<4 x float> zeroinitializer, float* %m, i64 2, i1 false, i32 1, i32 2) - call void @llvm.matrix.column.major.store.v6f32(<6 x float> zeroinitializer, float* %n, i64 2, i1 false, i32 3, i32 3) - call void @llvm.matrix.column.major.store.v6f32(<6 x float> zeroinitializer, float* %n, i64 %arg, i1 false, i32 3, i32 3) + call void @llvm.matrix.column.major.store.v4f32.i64(<4 x float> zeroinitializer, float* %m, i64 0, i1 false, i32 0, i32 0) + call void @llvm.matrix.column.major.store.v4f32.i64(<4 x float> zeroinitializer, float* %m, i64 2, i1 false, i32 1, i32 2) + call void @llvm.matrix.column.major.store.v6f32.i64(<6 x float> zeroinitializer, float* %n, i64 2, i1 false, i32 3, i32 3) + call void @llvm.matrix.column.major.store.v6f32.i64(<6 x float> zeroinitializer, float* %n, i64 %arg, i1 false, i32 3, i32 3) ret void } @@ -94,18 +94,18 @@ ; CHECK-NEXT: Intrinsic has incorrect argument type! ; CHECK-NEXT: <4 x float> (i32*, i64, i1, i32, i32)* @llvm.matrix.column.major.load.v4f32.pi32 ; CHECK-NEXT: Intrinsic has incorrect argument type! -; CHECK-NEXT: <4 x i32> (float*, i64, i1, i32, i32)* @llvm.matrix.column.major.load.v4i32 +; CHECK-NEXT: <4 x i32> (float*, i64, i1, i32, i32)* @llvm.matrix.column.major.load.v4i32.i64 ; %result.0 = call <4 x float> @llvm.matrix.column.major.load.v4f32.pi32(i32* %m, i64 2, i1 false, i32 2, i32 2) - %result.1 = call <4 x i32> @llvm.matrix.column.major.load.v4i32(float* %n, i64 2, i1 false, i32 2, i32 2) + %result.1 = call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(float* %n, i64 2, i1 false, i32 2, i32 2) ret <4 x float> %result.0 } define void @column.major_store_mixed_types(float* %m, i32* %n, i64 %arg) { ; -; CHECK-NEXT: Intrinsic has incorrect argument type! +; CHECK-NEXT: Intrinsic has incorrect argument type! 
; CHECK-NEXT: void (<4 x i32>, float*, i64, i1, i32, i32)* @llvm.matrix.column.major.store.v4i32.vi32 -; CHECK-NEXT: Intrinsic has incorrect argument type! +; CHECK-NEXT: Intrinsic has incorrect argument type! ; CHECK-NEXT: void (<4 x float>, i32*, i64, i1, i32, i32)* @llvm.matrix.column.major.store.v4f32.pi32 ; call void @llvm.matrix.column.major.store.v4i32.vi32(<4 x i32> zeroinitializer, float* %m, i64 2, i1 false, i32 2, i32 2) @@ -125,28 +125,28 @@ define <4 x float> @column.major_load_stride_too_small(float* %m, i32 %arg) { ; ; CHECK-NEXT: Stride must be greater or equal than the number of rows! -; CHECK-NEXT: <4 x float> (float*, i64, i1, i32, i32)* @llvm.matrix.column.major.load.v4f32 +; CHECK-NEXT: <4 x float> (float*, i64, i1, i32, i32)* @llvm.matrix.column.major.load.v4f32.i64 ; - %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32(float* %m, i64 1, i1 false, i32 2, i32 2) + %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i64(float* %m, i64 1, i1 false, i32 2, i32 2) ret <4 x float> %result.1 } define void @column.major_store_stride_too_small(float* %m, i64 %arg) { ; ; CHECK-NEXT: Stride must be greater or equal than the number of rows! 
-; CHECK-NEXT: void (<4 x float>, float*, i64, i1, i32, i32)* @llvm.matrix.column.major.store.v4f32 +; CHECK-NEXT: void (<4 x float>, float*, i64, i1, i32, i32)* @llvm.matrix.column.major.store.v4f32.i64 ; - call void @llvm.matrix.column.major.store.v4f32(<4 x float> zeroinitializer, float* %m, i64 1, i1 false, i32 2, i32 2) + call void @llvm.matrix.column.major.store.v4f32.i64(<4 x float> zeroinitializer, float* %m, i64 1, i1 false, i32 2, i32 2) ret void } -declare <4 x i32> @llvm.matrix.column.major.load.v4i32(float*, i64, i1, i32, i32) +declare <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(float*, i64, i1, i32, i32) declare <4 x float> @llvm.matrix.column.major.load.v4f32.pi32(i32*, i64, i1, i32, i32) -declare <4 x float> @llvm.matrix.column.major.load.v4f32(float*, i64, i1, i32, i32) -declare <6 x float> @llvm.matrix.column.major.load.v6f32(float*, i64, i1, i32, i32) +declare <4 x float> @llvm.matrix.column.major.load.v4f32.i64(float*, i64, i1, i32, i32) +declare <6 x float> @llvm.matrix.column.major.load.v6f32.i64(float*, i64, i1, i32, i32) -declare void @llvm.matrix.column.major.store.v4f32(<4 x float>, float*, i64, i1, i32, i32) -declare void @llvm.matrix.column.major.store.v6f32(<6 x float>, float*, i64, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v4f32.i64(<4 x float>, float*, i64, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v6f32.i64(<6 x float>, float*, i64, i1, i32, i32) declare void @llvm.matrix.column.major.store.v4i32.vi32(<4 x i32>, float*, i64, i1, i32, i32) declare void @llvm.matrix.column.major.store.v4f32.pi32(<4 x float>, i32*, i64, i1, i32, i32) declare void @llvm.matrix.column.major.store.v4f32p0.p0v4f32(<4 x float*>, <4 x float>*, i64, i1, i32, i32)