diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1996,6 +1996,32 @@ } } + // If Src is a fixed vector and Dst is a scalable vector, and both have the + // same element type, use the llvm.experimental.vector.insert intrinsic to + // perform the bitcast. + if (const auto *FixedSrc = dyn_cast(SrcTy)) { + if (const auto *ScalableDst = dyn_cast(DstTy)) { + if (FixedSrc->getElementType() == ScalableDst->getElementType()) { + llvm::Value *UndefVec = llvm::UndefValue::get(DstTy); + llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty); + return Builder.CreateInsertVector(DstTy, UndefVec, Src, Zero, + "castScalableSve"); + } + } + } + + // If Src is a scalable vector and Dst is a fixed vector, and both have the + // same element type, use the llvm.experimental.vector.extract intrinsic to + // perform the bitcast. + if (const auto *ScalableSrc = dyn_cast(SrcTy)) { + if (const auto *FixedDst = dyn_cast(DstTy)) { + if (ScalableSrc->getElementType() == FixedDst->getElementType()) { + llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty); + return Builder.CreateExtractVector(DstTy, Src, Zero, "castFixedSve"); + } + } + } + // Perform VLAT <-> VLST bitcast through memory. if ((isa(SrcTy) && isa(DstTy)) || diff --git a/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c b/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c --- a/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c +++ b/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c @@ -51,34 +51,22 @@ typedef int8_t vec_int8 __attribute__((vector_size(N / 8))); // CHECK128-LABEL: define <16 x i8> @f2(<16 x i8> %x) // CHECK128-NEXT: entry: -// CHECK128-NEXT: %x.addr = alloca <16 x i8>, align 16 -// CHECK128-NEXT: %saved-call-rvalue = alloca , align 16 -// CHECK128-NEXT: store <16 x i8> %x, <16 x i8>* %x.addr, align 16 -// CHECK128-NEXT: %0 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) -// CHECK128-NEXT: %1 = bitcast <16 x i8>* %x.addr to * -// CHECK128-NEXT: %2 = load , * %1, align 16 -// CHECK128-NEXT: %3 = call @llvm.aarch64.sve.asrd.nxv16i8( %0, %2, i32 1) -// CHECK128-NEXT: store %3, * %saved-call-rvalue, align 16 -// CHECK128-NEXT: %castFixedSve = bitcast * %saved-call-rvalue to <16 x i8>* -// CHECK128-NEXT: %4 = load <16 x i8>, <16 x i8>* %castFixedSve, align 16 -// CHECK128-NEXT: ret <16 x i8> %4 +// CHECK128-NEXT: [[TMP0:%.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) +// CHECK128-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, <16 x i8> [[X:%.*]], i64 0) +// CHECK128-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.asrd.nxv16i8( [[TMP0]], [[CASTSCALABLESVE]], i32 1) +// CHECK128-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8( [[TMP1]], i64 0) +// CHECK128-NEXT: ret <16 x i8> [[CASTFIXEDSVE]] // CHECK-LABEL: define void @f2( // CHECK-SAME: <[[#div(VBITS,8)]] x i8>* noalias nocapture sret(<[[#div(VBITS,8)]] x i8>) align 16 %agg.result, <[[#div(VBITS,8)]] x i8>* nocapture readonly %0) -// CHECK-NEXT: entry: -// CHECK-NEXT: %x.addr = alloca <[[#div(VBITS,8)]] x i8>, align 16 -// CHECK-NEXT: %saved-call-rvalue = alloca , align 16 -// CHECK-NEXT: %x = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* %0, align 16 -// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> %x, <[[#div(VBITS,8)]] x i8>* %x.addr, align 16 -// CHECK-NEXT: %1 = 
call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) -// CHECK-NEXT: %2 = bitcast <[[#div(VBITS,8)]] x i8>* %x.addr to * -// CHECK-NEXT: %3 = load , * %2, align 16 -// CHECK-NEXT: %4 = call @llvm.aarch64.sve.asrd.nxv16i8( %1, %3, i32 1) -// CHECK-NEXT: store %4, * %saved-call-rvalue, align 16 -// CHECK-NEXT: %castFixedSve = bitcast * %saved-call-rvalue to <[[#div(VBITS,8)]] x i8>* -// CHECK-NEXT: %5 = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* %castFixedSve, align 16 -// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> %5, <[[#div(VBITS,8)]] x i8>* %agg.result, align 16 -// CHECK-NEXT: ret void +// CHECK-NEXT: entry: +// CHECK-NEXT: [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]] +// CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8( undef, <[[#div(VBITS,8)]] x i8> [[X]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.asrd.nxv16i8( [[TMP1]], [[CASTSCALABLESVE]], i32 1) +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.experimental.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8( [[TMP2]], i64 0) +// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[CASTFIXEDSVE]], <[[#div(VBITS,8)]] x i8>* [[AGG_RESULT:%.*]], align 16, [[TBAA6]] +// CHECK-NEXT: ret void vec_int8 f2(vec_int8 x) { return svasrd_x(svptrue_b8(), x, 1); } #endif @@ -90,24 +78,24 @@ typedef svint8_t vec2 __attribute__((arm_sve_vector_bits(N))); // CHECK128-LABEL: define void @g( %x.coerce) -// CHECK128-NEXT: entry: -// CHECK128-NEXT: %x = alloca <16 x i8>, align 16 -// CHECK128-NEXT: %0 = bitcast <16 x i8>* %x to * -// CHECK128-NEXT: store %x.coerce, * %0, align 16 -// CHECK128-NEXT: %x1 = load <16 x i8>, <16 x i8>* %x, align 16, -// CHECK128-NEXT: call void @f3(<16 x i8> %x1) #4 -// CHECK128-NEXT: ret void +// CHECK128-NEXT: entry: +// CHECK128-NEXT: [[X:%.*]] = alloca <16 x i8>, align 16 +// CHECK128-NEXT: [[TMP0:%.*]] = bitcast <16 x i8>* [[X]] to * +// CHECK128-NEXT: store [[X_COERCE:%.*]], * [[TMP0]], align 16 +// CHECK128-NEXT: [[X1:%.*]] = load <16 x i8>, <16 x i8>* [[X]], align 16, [[TBAA6:!tbaa !.*]] +// CHECK128-NEXT: call void @f3(<16 x i8> [[X1]]) [[ATTR5:#.*]] +// CHECK128-NEXT: ret void // CHECK-LABEL: define void @g( %x.coerce) -// CHECK-NEXT: entry: -// CHECK-NEXT: %x = alloca <[[#div(VBITS,8)]] x i8>, align 16 -// CHECK-NEXT: %indirect-arg-temp = alloca <[[#div(VBITS,8)]] x i8>, align 16 -// CHECK-NEXT: %0 = bitcast <[[#div(VBITS,8)]] x i8>* %x to * -// CHECK-NEXT: store %x.coerce, * %0 -// CHECK-NEXT: %x1 = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* %x, align 16 -// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> %x1, <[[#div(VBITS,8)]] x i8>* %indirect-arg-temp -// CHECK-NEXT: call void @f3(<[[#div(VBITS,8)]] x i8>* nonnull %indirect-arg-temp) -// CHECK-NEXT: ret void +// CHECK-NEXT: entry: +// CHECK-NEXT: [[X:%.*]] = alloca <[[#div(VBITS,8)]] x i8>, align 16 +// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS,8)]] x i8>, align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <[[#div(VBITS,8)]] x i8>* [[X]] to * +// CHECK-NEXT: store [[X_COERCE:%.*]], * [[TMP0]], align 16 +// CHECK-NEXT: [[X1:%.*]] = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* [[X]], align 16, [[TBAA6]] +// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[X1]], <[[#div(VBITS,8)]] x i8>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6]] +// CHECK-NEXT: call void @f3(<[[#div(VBITS,8)]] x i8>* nonnull 
[[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] +// CHECK-NEXT: ret void // CHECK128-LABEL: declare void @f3(<16 x i8>) diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c @@ -30,23 +30,23 @@ // CHECK-128-LABEL: @read_int64( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <2 x i64>* [[ARRAYIDX]] to * -// CHECK-128-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6:!tbaa !.*]] -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16, [[TBAA6:!tbaa !.*]] +// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, <2 x i64> [[TMP0]], i64 0) +// CHECK-128-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-256-LABEL: @read_int64( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast <4 x i64>* [[ARRAYIDX]] to * -// CHECK-256-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6:!tbaa !.*]] -// CHECK-256-NEXT: ret [[TMP1]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, <4 x i64>* [[ARRAYIDX]], align 16, [[TBAA6:!tbaa !.*]] +// CHECK-256-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP0]], i64 0) +// CHECK-256-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-512-LABEL: @read_int64( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast <8 x i64>* [[ARRAYIDX]] to * -// CHECK-512-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6:!tbaa !.*]] -// CHECK-512-NEXT: ret [[TMP1]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[ARRAYIDX]], align 16, [[TBAA6:!tbaa !.*]] +// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v8i64( undef, <8 x i64> [[TMP0]], i64 0) +// CHECK-512-NEXT: ret [[CASTSCALABLESVE]] // svint64_t read_int64(struct struct_int64 *s) { return s->y[0]; @@ -54,32 +54,23 @@ // CHECK-128-LABEL: @write_int64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-128-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]] -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <2 x i64>* -// CHECK-128-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64( [[X:%.*]], i64 0) // CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-128-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-128-NEXT: store <2 x i64> [[CASTFIXEDSVE]], <2 x i64>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: @write_int64( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-256-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]] -// CHECK-256-NEXT: 
[[TMP0:%.*]] = bitcast * [[X_ADDR]] to <4 x i64>* -// CHECK-256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-256-NEXT: [[CASTFIXEDSVE:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[X:%.*]], i64 0) // CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-256-NEXT: store <4 x i64> [[TMP1]], <4 x i64>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-256-NEXT: store <4 x i64> [[CASTFIXEDSVE]], <4 x i64>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: @write_int64( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-512-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]] -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <8 x i64>* -// CHECK-512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x i64> @llvm.experimental.vector.extract.v8i64.nxv2i64( [[X:%.*]], i64 0) // CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-512-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-512-NEXT: store <8 x i64> [[CASTFIXEDSVE]], <8 x i64>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-512-NEXT: ret void // void write_int64(struct struct_int64 *s, svint64_t x) { @@ -93,23 +84,23 @@ // CHECK-128-LABEL: @read_float64( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <2 x double>* [[ARRAYIDX]] to * -// CHECK-128-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6]] -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2f64.v2f64( undef, <2 x double> [[TMP0]], i64 0) +// CHECK-128-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-256-LABEL: @read_float64( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast <4 x double>* [[ARRAYIDX]] to * -// CHECK-256-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6]] -// CHECK-256-NEXT: ret [[TMP1]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-256-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2f64.v4f64( undef, <4 x double> [[TMP0]], i64 0) +// CHECK-256-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-512-LABEL: @read_float64( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[ARRAYIDX]] to * -// CHECK-512-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6]] -// CHECK-512-NEXT: ret [[TMP1]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x double>, <8 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2f64.v8f64( undef, <8 x double> [[TMP0]], i64 0) +// CHECK-512-NEXT: 
ret [[CASTSCALABLESVE]] // svfloat64_t read_float64(struct struct_float64 *s) { return s->y[0]; @@ -117,32 +108,23 @@ // CHECK-128-LABEL: @write_float64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-128-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA11:!tbaa !.*]] -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <2 x double>* -// CHECK-128-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <2 x double> @llvm.experimental.vector.extract.v2f64.nxv2f64( [[X:%.*]], i64 0) // CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-128-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-128-NEXT: store <2 x double> [[CASTFIXEDSVE]], <2 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: @write_float64( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-256-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA11:!tbaa !.*]] -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <4 x double>* -// CHECK-256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-256-NEXT: [[CASTFIXEDSVE:%.*]] = call <4 x double> @llvm.experimental.vector.extract.v4f64.nxv2f64( [[X:%.*]], i64 0) // CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-256-NEXT: store <4 x double> [[TMP1]], <4 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-256-NEXT: store <4 x double> [[CASTFIXEDSVE]], <4 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: @write_float64( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-512-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA11:!tbaa !.*]] -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <8 x double>* -// CHECK-512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64( [[X:%.*]], i64 0) // CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-512-NEXT: store <8 x double> [[TMP1]], <8 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-512-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-512-NEXT: ret void // void write_float64(struct struct_float64 *s, svfloat64_t x) { @@ -156,23 +138,23 @@ // CHECK-128-LABEL: @read_bfloat16( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat>* [[ARRAYIDX]] to * -// CHECK-128-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6]] -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> [[TMP0]], i64 0) +// CHECK-128-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-256-LABEL: @read_bfloat16( // 
CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast <16 x bfloat>* [[ARRAYIDX]] to * -// CHECK-256-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6]] -// CHECK-256-NEXT: ret [[TMP1]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <16 x bfloat>, <16 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-256-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv8bf16.v16bf16( undef, <16 x bfloat> [[TMP0]], i64 0) +// CHECK-256-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-512-LABEL: @read_bfloat16( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast <32 x bfloat>* [[ARRAYIDX]] to * -// CHECK-512-NEXT: [[TMP1:%.*]] = load , * [[TMP0]], align 16, [[TBAA6]] -// CHECK-512-NEXT: ret [[TMP1]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, <32 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv8bf16.v32bf16( undef, <32 x bfloat> [[TMP0]], i64 0) +// CHECK-512-NEXT: ret [[CASTSCALABLESVE]] // svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) { return s->y[0]; @@ -180,32 +162,23 @@ // CHECK-128-LABEL: @write_bfloat16( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-128-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA13:!tbaa !.*]] -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <8 x bfloat>* -// CHECK-128-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x bfloat> @llvm.experimental.vector.extract.v8bf16.nxv8bf16( [[X:%.*]], i64 0) // CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-128-NEXT: store <8 x bfloat> [[TMP1]], <8 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-128-NEXT: store <8 x bfloat> [[CASTFIXEDSVE]], <8 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: @write_bfloat16( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-256-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA13:!tbaa !.*]] -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <16 x bfloat>* -// CHECK-256-NEXT: [[TMP1:%.*]] = load <16 x bfloat>, <16 x bfloat>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-256-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x bfloat> @llvm.experimental.vector.extract.v16bf16.nxv8bf16( [[X:%.*]], i64 0) // CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-256-NEXT: store <16 x bfloat> [[TMP1]], <16 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-256-NEXT: store <16 x bfloat> [[CASTFIXEDSVE]], <16 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: @write_bfloat16( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-512-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA13:!tbaa !.*]] -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <32 x bfloat>* -// CHECK-512-NEXT: [[TMP1:%.*]] = load <32 x bfloat>, <32 x bfloat>* [[TMP0]], 
align 16, [[TBAA6]] +// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <32 x bfloat> @llvm.experimental.vector.extract.v32bf16.nxv8bf16( [[X:%.*]], i64 0) // CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0 -// CHECK-512-NEXT: store <32 x bfloat> [[TMP1]], <32 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] +// CHECK-512-NEXT: store <32 x bfloat> [[CASTFIXEDSVE]], <32 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA6]] // CHECK-512-NEXT: ret void // void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { @@ -244,7 +217,7 @@ // CHECK-128-LABEL: @write_bool( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-128-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA15:!tbaa !.*]] +// CHECK-128-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]] // CHECK-128-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <2 x i8>* // CHECK-128-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 16, [[TBAA6]] // CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0 @@ -254,7 +227,7 @@ // CHECK-256-LABEL: @write_bool( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-256-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA15:!tbaa !.*]] +// CHECK-256-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]] // CHECK-256-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <4 x i8>* // CHECK-256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 16, [[TBAA6]] // CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0 @@ -264,7 +237,7 @@ // CHECK-512-LABEL: @write_bool( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[X_ADDR:%.*]] = alloca , align 16 -// CHECK-512-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA15:!tbaa !.*]] +// CHECK-512-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]] // CHECK-512-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <8 x i8>* // CHECK-512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 16, [[TBAA6]] // CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0 diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c @@ -25,22 +25,16 @@ // CHECK-LABEL: @fixed_caller( // CHECK-NEXT: entry: // CHECK-NEXT: [[X:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i32>* [[X]] to * // CHECK-NEXT: store [[X_COERCE:%.*]], * [[TMP0]], align 16 // CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, <16 x i32>* [[X]], align 16, [[TBAA6:!tbaa !.*]] -// CHECK-NEXT: store <16 x i32> [[X1]], <16 x i32>* [[X_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[X_ADDR]] to * -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16, [[TBAA6]] -// CHECK-NEXT: store [[TMP2]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA9:!tbaa !.*]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <16 x i32>* -// CHECK-NEXT: 
[[TMP3:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[X1]], i64 0) +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[CASTSCALABLESVE]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <16 x i32>* -// CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP4]] +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP1]] // fixed_int32_t fixed_caller(fixed_int32_t x) { return sizeless_callee(x); @@ -64,24 +58,18 @@ // CHECK-LABEL: @sizeless_caller( // CHECK-NEXT: entry: -// CHECK-NEXT: [[X_ADDR:%.*]] = alloca , align 16 // CHECK-NEXT: [[COERCE_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[COERCE1:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca <16 x i32>, align 64 -// CHECK-NEXT: store [[X:%.*]], * [[X_ADDR]], align 16, [[TBAA9]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast * [[X_ADDR]] to <16 x i32>* -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[X:%.*]], i64 0) // CHECK-NEXT: [[COERCE_0__SROA_CAST:%.*]] = bitcast * [[COERCE_COERCE]] to <16 x i32>* -// CHECK-NEXT: store <16 x i32> [[TMP1]], <16 x i32>* [[COERCE_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[COERCE_COERCE]], align 16 -// CHECK-NEXT: [[CALL:%.*]] = call @fixed_callee( [[TMP2]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32>* [[COERCE1]] to * -// CHECK-NEXT: store [[CALL]], * [[TMP3]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, <16 x i32>* [[COERCE1]], align 16, [[TBAA6]] -// CHECK-NEXT: store <16 x i32> [[TMP4]], <16 x i32>* [[SAVED_CALL_RVALUE]], align 64, [[TBAA6]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <16 x i32>* [[SAVED_CALL_RVALUE]] to * -// CHECK-NEXT: [[TMP5:%.*]] = load , * [[CASTFIXEDSVE]], align 64, [[TBAA6]] -// CHECK-NEXT: ret [[TMP5]] +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[COERCE_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load , * [[COERCE_COERCE]], align 16 +// CHECK-NEXT: [[CALL:%.*]] = call @fixed_callee( [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[COERCE1]] to * +// CHECK-NEXT: store [[CALL]], * [[TMP1]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[COERCE1]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[TMP2]], i64 0) +// CHECK-NEXT: ret [[CASTSCALABLESVE]] // svint32_t sizeless_caller(svint32_t x) { return fixed_callee(x); @@ -95,9 +83,6 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[OP1:%.*]] = alloca <16 x i32>, align 16 // CHECK-NEXT: [[OP2:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[OP1_ADDR:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[OP2_ADDR:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i32>* [[OP1]] to * // CHECK-NEXT: store [[OP1_COERCE:%.*]], * 
[[TMP0]], align 16 @@ -105,21 +90,15 @@ // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[OP2]] to * // CHECK-NEXT: store [[OP2_COERCE:%.*]], * [[TMP1]], align 16 // CHECK-NEXT: [[OP22:%.*]] = load <16 x i32>, <16 x i32>* [[OP2]], align 16, [[TBAA6]] -// CHECK-NEXT: store <16 x i32> [[OP11]], <16 x i32>* [[OP1_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: store <16 x i32> [[OP22]], <16 x i32>* [[OP2_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32>* [[OP1_ADDR]] to * -// CHECK-NEXT: [[TMP3:%.*]] = load , * [[TMP2]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32>* [[OP2_ADDR]] to * -// CHECK-NEXT: [[TMP5:%.*]] = load , * [[TMP4]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP6:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.sel.nxv4i32( [[TMP6]], [[TMP3]], [[TMP5]]) -// CHECK-NEXT: store [[TMP7]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA9]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <16 x i32>* -// CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[OP11]], i64 0) +// CHECK-NEXT: [[CASTSCALABLESVE3:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[OP22]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.aarch64.sve.sel.nxv4i32( [[TMP2]], [[CASTSCALABLESVE]], [[CASTSCALABLESVE3]]) +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[TMP3]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <16 x i32>* -// CHECK-NEXT: store <16 x i32> [[TMP8]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP9]] +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP4]] // fixed_int32_t call_int32_ff(svbool_t pg, fixed_int32_t op1, fixed_int32_t op2) { return svsel(pg, op1, op2); @@ -129,9 +108,6 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[OP1:%.*]] = alloca <8 x double>, align 16 // CHECK-NEXT: [[OP2:%.*]] = alloca <8 x double>, align 16 -// CHECK-NEXT: [[OP1_ADDR:%.*]] = alloca <8 x double>, align 16 -// CHECK-NEXT: [[OP2_ADDR:%.*]] = alloca <8 x double>, align 16 -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[OP1]] to * // CHECK-NEXT: store [[OP1_COERCE:%.*]], * [[TMP0]], align 16 @@ -139,21 +115,15 @@ // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x double>* [[OP2]] to * // CHECK-NEXT: store [[OP2_COERCE:%.*]], * [[TMP1]], align 16 // CHECK-NEXT: [[OP22:%.*]] = load <8 x double>, <8 x double>* [[OP2]], align 16, [[TBAA6]] -// CHECK-NEXT: store <8 x double> [[OP11]], <8 x double>* [[OP1_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: store <8 x double> [[OP22]], <8 x double>* [[OP2_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x double>* [[OP1_ADDR]] to * -// CHECK-NEXT: [[TMP3:%.*]] = load , * [[TMP2]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x double>* [[OP2_ADDR]] to * -// CHECK-NEXT: [[TMP5:%.*]] = load , * 
[[TMP4]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP6:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.sel.nxv2f64( [[TMP6]], [[TMP3]], [[TMP5]]) -// CHECK-NEXT: store [[TMP7]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA11:!tbaa !.*]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <8 x double>* -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2f64.v8f64( undef, <8 x double> [[OP11]], i64 0) +// CHECK-NEXT: [[CASTSCALABLESVE3:%.*]] = call @llvm.experimental.vector.insert.nxv2f64.v8f64( undef, <8 x double> [[OP22]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.aarch64.sve.sel.nxv2f64( [[TMP2]], [[CASTSCALABLESVE]], [[CASTSCALABLESVE3]]) +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64( [[TMP3]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x double>* -// CHECK-NEXT: store <8 x double> [[TMP8]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP9]] +// CHECK-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP4]] // fixed_float64_t call_float64_ff(svbool_t pg, fixed_float64_t op1, fixed_float64_t op2) { return svsel(pg, op1, op2); @@ -180,7 +150,7 @@ // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8>* [[OP2_ADDR]] to * // CHECK-NEXT: [[TMP5:%.*]] = load , * [[TMP4]], align 16, [[TBAA6]] // CHECK-NEXT: [[TMP6:%.*]] = call @llvm.aarch64.sve.sel.nxv16i1( [[PG:%.*]], [[TMP3]], [[TMP5]]) -// CHECK-NEXT: store [[TMP6]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA13:!tbaa !.*]] +// CHECK-NEXT: store [[TMP6]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA9:!tbaa !.*]] // CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <8 x i8>* // CHECK-NEXT: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x i8>* @@ -199,24 +169,18 @@ // CHECK-LABEL: @call_int32_fs( // CHECK-NEXT: entry: // CHECK-NEXT: [[OP1:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[OP1_ADDR:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i32>* [[OP1]] to * // CHECK-NEXT: store [[OP1_COERCE:%.*]], * [[TMP0]], align 16 // CHECK-NEXT: [[OP11:%.*]] = load <16 x i32>, <16 x i32>* [[OP1]], align 16, [[TBAA6]] -// CHECK-NEXT: store <16 x i32> [[OP11]], <16 x i32>* [[OP1_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[OP1_ADDR]] to * -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP4:%.*]] = call @llvm.aarch64.sve.sel.nxv4i32( [[TMP3]], [[TMP2]], [[OP2:%.*]]) -// CHECK-NEXT: store [[TMP4]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA9]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <16 x i32>* -// CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, <16 
x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[OP11]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.sel.nxv4i32( [[TMP1]], [[CASTSCALABLESVE]], [[OP2:%.*]]) +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[TMP2]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <16 x i32>* -// CHECK-NEXT: store <16 x i32> [[TMP5]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP6]] +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP3]] // fixed_int32_t call_int32_fs(svbool_t pg, fixed_int32_t op1, svint32_t op2) { return svsel(pg, op1, op2); @@ -225,24 +189,18 @@ // CHECK-LABEL: @call_float64_fs( // CHECK-NEXT: entry: // CHECK-NEXT: [[OP1:%.*]] = alloca <8 x double>, align 16 -// CHECK-NEXT: [[OP1_ADDR:%.*]] = alloca <8 x double>, align 16 -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[OP1]] to * // CHECK-NEXT: store [[OP1_COERCE:%.*]], * [[TMP0]], align 16 // CHECK-NEXT: [[OP11:%.*]] = load <8 x double>, <8 x double>* [[OP1]], align 16, [[TBAA6]] -// CHECK-NEXT: store <8 x double> [[OP11]], <8 x double>* [[OP1_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x double>* [[OP1_ADDR]] to * -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP4:%.*]] = call @llvm.aarch64.sve.sel.nxv2f64( [[TMP3]], [[TMP2]], [[OP2:%.*]]) -// CHECK-NEXT: store [[TMP4]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA11]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <8 x double>* -// CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2f64.v8f64( undef, <8 x double> [[OP11]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.sel.nxv2f64( [[TMP1]], [[CASTSCALABLESVE]], [[OP2:%.*]]) +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64( [[TMP2]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x double>* -// CHECK-NEXT: store <8 x double> [[TMP5]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP6]] +// CHECK-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP3]] // fixed_float64_t call_float64_fs(svbool_t pg, fixed_float64_t op1, svfloat64_t op2) { return svsel(pg, op1, op2); @@ -261,7 +219,7 @@ // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8>* [[OP1_ADDR]] to * // CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16, [[TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = 
call @llvm.aarch64.sve.sel.nxv16i1( [[PG:%.*]], [[TMP2]], [[OP2:%.*]]) -// CHECK-NEXT: store [[TMP3]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA13]] +// CHECK-NEXT: store [[TMP3]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA9]] // CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <8 x i8>* // CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x i8>* @@ -279,17 +237,14 @@ // CHECK-LABEL: @call_int32_ss( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.sel.nxv4i32( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: store [[TMP1]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA9]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <16 x i32>* -// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[TMP1]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <16 x i32>* -// CHECK-NEXT: store <16 x i32> [[TMP2]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP3]] +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP2]] // fixed_int32_t call_int32_ss(svbool_t pg, svint32_t op1, svint32_t op2) { return svsel(pg, op1, op2); @@ -297,17 +252,14 @@ // CHECK-LABEL: @call_float64_ss( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.sel.nxv2f64( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: store [[TMP1]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA11]] -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <8 x double>* -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64( [[TMP1]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x double>* -// CHECK-NEXT: store <8 x double> [[TMP2]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP3]] +// CHECK-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP2]] // fixed_float64_t call_float64_ss(svbool_t pg, svfloat64_t op1, svfloat64_t op2) { return svsel(pg, op1, op2); @@ -318,7 +270,7 @@ // CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: [[TMP0:%.*]] = call @llvm.aarch64.sve.sel.nxv16i1( [[PG:%.*]], [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: store [[TMP0]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA13]] +// CHECK-NEXT: store 
[[TMP0]], * [[SAVED_CALL_RVALUE]], align 16, [[TBAA9]] // CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <8 x i8>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE]], align 16, [[TBAA6]] // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x i8>* diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c @@ -14,14 +14,11 @@ // CHECK-LABEL: @to_svint32_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = alloca <16 x i32>, align 16 -// CHECK-NEXT: [[TYPE_ADDR:%.*]] = alloca <16 x i32>, align 16 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i32>* [[TYPE]] to * // CHECK-NEXT: store [[TYPE_COERCE:%.*]], * [[TMP0]], align 16 // CHECK-NEXT: [[TYPE1:%.*]] = load <16 x i32>, <16 x i32>* [[TYPE]], align 16, [[TBAA6:!tbaa !.*]] -// CHECK-NEXT: store <16 x i32> [[TYPE1]], <16 x i32>* [[TYPE_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[TYPE_ADDR]] to * -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16, [[TBAA6]] -// CHECK-NEXT: ret [[TMP2]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[TYPE1]], i64 0) +// CHECK-NEXT: ret [[CASTSCALABLESVE]] // svint32_t to_svint32_t(fixed_int32_t type) { return type; @@ -29,15 +26,12 @@ // CHECK-LABEL: @from_svint32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE_ADDR:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 -// CHECK-NEXT: store [[TYPE:%.*]], * [[TYPE_ADDR]], align 16, [[TBAA9:!tbaa !.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast * [[TYPE_ADDR]] to <16 x i32>* -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <16 x i32>* -// CHECK-NEXT: store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP2]] +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP0]] // fixed_int32_t from_svint32_t(svint32_t type) { return type; @@ -46,14 +40,11 @@ // CHECK-LABEL: @to_svfloat64_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = alloca <8 x double>, align 16 -// CHECK-NEXT: [[TYPE_ADDR:%.*]] = alloca <8 x double>, align 16 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[TYPE]] to * // CHECK-NEXT: store [[TYPE_COERCE:%.*]], * [[TMP0]], align 16 // CHECK-NEXT: [[TYPE1:%.*]] = load <8 x double>, <8 x double>* [[TYPE]], align 16, [[TBAA6]] -// CHECK-NEXT: store <8 x double> [[TYPE1]], <8 x double>* [[TYPE_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x double>* [[TYPE_ADDR]] to * -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16, [[TBAA6]] -// CHECK-NEXT: ret [[TMP2]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2f64.v8f64( undef, <8 x double> [[TYPE1]], i64 0) +// CHECK-NEXT: ret [[CASTSCALABLESVE]] // svfloat64_t to_svfloat64_t(fixed_float64_t type) { return type; @@ -61,15 +52,12 @@ // CHECK-LABEL: @from_svfloat64_t( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TYPE_ADDR:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 -// CHECK-NEXT: store [[TYPE:%.*]], * [[TYPE_ADDR]], align 16, [[TBAA11:!tbaa !.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast * [[TYPE_ADDR]] to <8 x double>* -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64( [[TYPE:%.*]], i64 0) // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x double>* -// CHECK-NEXT: store <8 x double> [[TMP1]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP2]] +// CHECK-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP0]] // fixed_float64_t from_svfloat64_t(svfloat64_t type) { return type; @@ -95,7 +83,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE_ADDR:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 -// CHECK-NEXT: store [[TYPE:%.*]], * [[TYPE_ADDR]], align 16, [[TBAA13:!tbaa !.*]] +// CHECK-NEXT: store [[TYPE:%.*]], * [[TYPE_ADDR]], align 16, [[TBAA9:!tbaa !.*]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast * [[TYPE_ADDR]] to <8 x i8>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 16, [[TBAA6]] // CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast * [[RETVAL_COERCE]] to <8 x i8>* @@ -109,12 +97,9 @@ // CHECK-LABEL: @to_svint32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE_ADDR:%.*]] = alloca <16 x i32>, align 16 // CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0:%.*]], align 16, [[TBAA6]] -// CHECK-NEXT: store <16 x i32> [[TYPE]], <16 x i32>* [[TYPE_ADDR]], align 16, [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[TYPE_ADDR]] to * -// CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16, [[TBAA6]] -// CHECK-NEXT: ret [[TMP2]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CASTSCALABLESVE]] // svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -122,11 +107,8 @@ // CHECK-LABEL: @from_svint32_t__to_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE_ADDR:%.*]] = alloca , align 16 -// CHECK-NEXT: store [[TYPE:%.*]], * [[TYPE_ADDR]], align 16, [[TBAA9]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast * [[TYPE_ADDR]] to <16 x i32>* -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, [[TBAA6]] -// CHECK-NEXT: store <16 x i32> [[TMP1]], <16 x i32>* [[AGG_RESULT:%.*]], align 16, [[TBAA6]] +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[AGG_RESULT:%.*]], align 16, [[TBAA6]] // CHECK-NEXT: ret void // gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c @@ -17,7 +17,6 @@ // CHECK-NEXT: [[PRED_ADDR:%.*]] = alloca , align 2 // CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 16 // CHECK-NEXT: [[PG:%.*]] = alloca , align 2 
-// CHECK-NEXT: [[SAVED_CALL_RVALUE:%.*]] = alloca , align 16 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 // CHECK-NEXT: store [[PRED:%.*]], * [[PRED_ADDR]], align 2 // CHECK-NEXT: store [[VEC:%.*]], * [[VEC_ADDR]], align 16 @@ -30,19 +29,17 @@ // CHECK-NEXT: store [[TMP5]], * [[PG]], align 2 // CHECK-NEXT: [[TMP6:%.*]] = load , * [[PG]], align 2 // CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, <16 x i32>* @global_vec, align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load , * bitcast (<16 x i32>* @global_vec to *), align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load , * [[VEC_ADDR]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[TMP6]]) -// CHECK-NEXT: [[TMP11:%.*]] = call @llvm.aarch64.sve.add.nxv4i32( [[TMP10]], [[TMP8]], [[TMP9]]) -// CHECK-NEXT: store [[TMP11]], * [[SAVED_CALL_RVALUE]], align 16 -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast * [[SAVED_CALL_RVALUE]] to <16 x i32>* -// CHECK-NEXT: [[TMP12:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16 -// CHECK-NEXT: store <16 x i32> [[TMP12]], <16 x i32>* [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP13:%.*]] = bitcast * [[RETVAL_COERCE]] to i8* -// CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8* -// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP13]], i8* align 16 [[TMP14]], i64 64, i1 false) -// CHECK-NEXT: [[TMP15:%.*]] = load , * [[RETVAL_COERCE]], align 16 -// CHECK-NEXT: ret [[TMP15]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v16i32( undef, <16 x i32> [[TMP7]], i64 0) +// CHECK-NEXT: [[TMP8:%.*]] = load , * [[VEC_ADDR]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[TMP6]]) +// CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.add.nxv4i32( [[TMP9]], [[CASTSCALABLESVE]], [[TMP8]]) +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32( [[TMP10]], i64 0) +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast * [[RETVAL_COERCE]] to i8* +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 64, i1 false) +// CHECK-NEXT: [[TMP13:%.*]] = load , * [[RETVAL_COERCE]], align 16 +// CHECK-NEXT: ret [[TMP13]] // fixed_int32_t foo(svbool_t pred, svint32_t vec) { svbool_t pg = svand_z(pred, global_pred, global_pred); diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c @@ -21,40 +21,28 @@ // CHECK-128-LABEL: @write_global_i64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[V_ADDR:%.*]] = alloca , align 16 -// CHECK-128-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA6:!tbaa !.*]] -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast * [[V_ADDR]] to <2 x i64>* -// CHECK-128-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 16, [[TBAA10:!tbaa !.*]] -// CHECK-128-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* @global_i64, align 16, [[TBAA10]] +// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64( [[V:%.*]], i64 0) +// CHECK-128-NEXT: store <2 x i64> [[CASTFIXEDSVE]], <2 x i64>* @global_i64, align 16, [[TBAA6:!tbaa !.*]] // CHECK-128-NEXT: ret void // // CHECK-512-LABEL: 
@write_global_i64( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[V_ADDR:%.*]] = alloca , align 16 -// CHECK-512-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA6:!tbaa !.*]] -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast * [[V_ADDR]] to <8 x i64>* -// CHECK-512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[TMP0]], align 16, [[TBAA10:!tbaa !.*]] -// CHECK-512-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* @global_i64, align 16, [[TBAA10]] +// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x i64> @llvm.experimental.vector.extract.v8i64.nxv2i64( [[V:%.*]], i64 0) +// CHECK-512-NEXT: store <8 x i64> [[CASTFIXEDSVE]], <8 x i64>* @global_i64, align 16, [[TBAA6:!tbaa !.*]] // CHECK-512-NEXT: ret void // void write_global_i64(svint64_t v) { global_i64 = v; } // CHECK-128-LABEL: @write_global_bf16( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[V_ADDR:%.*]] = alloca , align 16 -// CHECK-128-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA11:!tbaa !.*]] -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast * [[V_ADDR]] to <8 x bfloat>* -// CHECK-128-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP0]], align 16, [[TBAA10]] -// CHECK-128-NEXT: store <8 x bfloat> [[TMP1]], <8 x bfloat>* @global_bf16, align 16, [[TBAA10]] +// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x bfloat> @llvm.experimental.vector.extract.v8bf16.nxv8bf16( [[V:%.*]], i64 0) +// CHECK-128-NEXT: store <8 x bfloat> [[CASTFIXEDSVE]], <8 x bfloat>* @global_bf16, align 16, [[TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-512-LABEL: @write_global_bf16( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[V_ADDR:%.*]] = alloca , align 16 -// CHECK-512-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA11:!tbaa !.*]] -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast * [[V_ADDR]] to <32 x bfloat>* -// CHECK-512-NEXT: [[TMP1:%.*]] = load <32 x bfloat>, <32 x bfloat>* [[TMP0]], align 16, [[TBAA10]] -// CHECK-512-NEXT: store <32 x bfloat> [[TMP1]], <32 x bfloat>* @global_bf16, align 16, [[TBAA10]] +// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <32 x bfloat> @llvm.experimental.vector.extract.v32bf16.nxv8bf16( [[V:%.*]], i64 0) +// CHECK-512-NEXT: store <32 x bfloat> [[CASTFIXEDSVE]], <32 x bfloat>* @global_bf16, align 16, [[TBAA6]] // CHECK-512-NEXT: ret void // void write_global_bf16(svbfloat16_t v) { global_bf16 = v; } @@ -62,19 +50,19 @@ // CHECK-128-LABEL: @write_global_bool( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[V_ADDR:%.*]] = alloca , align 16 -// CHECK-128-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA13:!tbaa !.*]] +// CHECK-128-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA9:!tbaa !.*]] // CHECK-128-NEXT: [[TMP0:%.*]] = bitcast * [[V_ADDR]] to <2 x i8>* -// CHECK-128-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 16, [[TBAA10]] -// CHECK-128-NEXT: store <2 x i8> [[TMP1]], <2 x i8>* @global_bool, align 2, [[TBAA10]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-128-NEXT: store <2 x i8> [[TMP1]], <2 x i8>* @global_bool, align 2, [[TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-512-LABEL: @write_global_bool( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[V_ADDR:%.*]] = alloca , align 16 -// CHECK-512-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA13:!tbaa !.*]] +// CHECK-512-NEXT: store [[V:%.*]], * [[V_ADDR]], align 16, [[TBAA9:!tbaa !.*]] // CHECK-512-NEXT: [[TMP0:%.*]] = bitcast * [[V_ADDR]] to <8 x i8>* -// CHECK-512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 16, [[TBAA10]] -// CHECK-512-NEXT: store <8 x i8> 
[[TMP1]], <8 x i8>* @global_bool, align 2, [[TBAA10]] +// CHECK-512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 16, [[TBAA6]] +// CHECK-512-NEXT: store <8 x i8> [[TMP1]], <8 x i8>* @global_bool, align 2, [[TBAA6]] // CHECK-512-NEXT: ret void // void write_global_bool(svbool_t v) { global_bool = v; } @@ -85,36 +73,40 @@ // CHECK-128-LABEL: @read_global_i64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = load , * bitcast (<2 x i64>* @global_i64 to *), align 16, [[TBAA10]] -// CHECK-128-NEXT: ret [[TMP0]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* @global_i64, align 16, [[TBAA6]] +// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, <2 x i64> [[TMP0]], i64 0) +// CHECK-128-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-512-LABEL: @read_global_i64( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[TMP0:%.*]] = load , * bitcast (<8 x i64>* @global_i64 to *), align 16, [[TBAA10]] -// CHECK-512-NEXT: ret [[TMP0]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* @global_i64, align 16, [[TBAA6]] +// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v8i64( undef, <8 x i64> [[TMP0]], i64 0) +// CHECK-512-NEXT: ret [[CASTSCALABLESVE]] // svint64_t read_global_i64() { return global_i64; } // CHECK-128-LABEL: @read_global_bf16( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = load , * bitcast (<8 x bfloat>* @global_bf16 to *), align 16, [[TBAA10]] -// CHECK-128-NEXT: ret [[TMP0]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, <8 x bfloat>* @global_bf16, align 16, [[TBAA6]] +// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> [[TMP0]], i64 0) +// CHECK-128-NEXT: ret [[CASTSCALABLESVE]] // // CHECK-512-LABEL: @read_global_bf16( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[TMP0:%.*]] = load , * bitcast (<32 x bfloat>* @global_bf16 to *), align 16, [[TBAA10]] -// CHECK-512-NEXT: ret [[TMP0]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, <32 x bfloat>* @global_bf16, align 16, [[TBAA6]] +// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = call @llvm.experimental.vector.insert.nxv8bf16.v32bf16( undef, <32 x bfloat> [[TMP0]], i64 0) +// CHECK-512-NEXT: ret [[CASTSCALABLESVE]] // svbfloat16_t read_global_bf16() { return global_bf16; } // CHECK-128-LABEL: @read_global_bool( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = load , * bitcast (<2 x i8>* @global_bool to *), align 2, [[TBAA10]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load , * bitcast (<2 x i8>* @global_bool to *), align 2, [[TBAA6]] // CHECK-128-NEXT: ret [[TMP0]] // // CHECK-512-LABEL: @read_global_bool( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[TMP0:%.*]] = load , * bitcast (<8 x i8>* @global_bool to *), align 2, [[TBAA10]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load , * bitcast (<8 x i8>* @global_bool to *), align 2, [[TBAA6]] // CHECK-512-NEXT: ret [[TMP0]] // svbool_t read_global_bool() { return global_bool; } diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -919,6 +919,20 @@ return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name); } + CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, + const Twine &Name = "") { + return CreateIntrinsic(Intrinsic::experimental_vector_extract, + {DstType, SrcVec->getType()}, {SrcVec, Idx}, nullptr, + Name); + } + + 
CallInst *CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, + Value *Idx, const Twine &Name = "") { + return CreateIntrinsic(Intrinsic::experimental_vector_insert, + {DstType, SubVec->getType()}, {SrcVec, SubVec, Idx}, + nullptr, Name); + } + private: /// Create a call to a masked intrinsic with given Id. CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef<Value *> Ops,
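
Note for readers of the CGExprScalar.cpp hunk above: a minimal self-contained sketch of the same lowering, with the dyn_cast/isa template arguments spelled out. The free function emitFixedScalableCast and the IRBuilderBase parameter are illustrative only; in the patch the checks sit inline ahead of the memory-based VLAT <-> VLST path, and the index type comes from CGF.CGM.Int64Ty rather than Builder.getInt64Ty().

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/Support/Casting.h"

    // Sketch of the VLST <-> VLAT cast lowering: instead of spilling through
    // memory, a fixed-length vector is inserted into (or extracted from)
    // element 0 of a scalable vector via the llvm.experimental.vector.insert
    // and llvm.experimental.vector.extract intrinsics.
    static llvm::Value *emitFixedScalableCast(llvm::IRBuilderBase &Builder,
                                              llvm::Value *Src,
                                              llvm::Type *DstTy) {
      llvm::Type *SrcTy = Src->getType();
      llvm::Value *Zero = llvm::Constant::getNullValue(Builder.getInt64Ty());

      // Fixed -> scalable: insert the fixed vector into an undef scalable one.
      if (const auto *FixedSrc = llvm::dyn_cast<llvm::FixedVectorType>(SrcTy))
        if (const auto *ScalableDst = llvm::dyn_cast<llvm::ScalableVectorType>(DstTy))
          if (FixedSrc->getElementType() == ScalableDst->getElementType())
            return Builder.CreateInsertVector(DstTy, llvm::UndefValue::get(DstTy),
                                              Src, Zero, "castScalableSve");

      // Scalable -> fixed: extract the leading fixed-width subvector.
      if (const auto *ScalableSrc = llvm::dyn_cast<llvm::ScalableVectorType>(SrcTy))
        if (const auto *FixedDst = llvm::dyn_cast<llvm::FixedVectorType>(DstTy))
          if (ScalableSrc->getElementType() == FixedDst->getElementType())
            return Builder.CreateExtractVector(DstTy, Src, Zero, "castFixedSve");

      return nullptr; // element types differ: fall back to the memory round-trip
    }

The effect is visible in the updated tests above: the old alloca/bitcast/load sequences collapse into a single insert, intrinsic call, extract chain.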
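
A second sketch, also illustrative rather than part of the patch, drives the two new IRBuilder helpers directly; it assumes a tree with this change applied (the helpers are introduced in the IRBuilder.h hunk above). It emits the same llvm.experimental.vector.insert.nxv4i32.v16i32 and llvm.experimental.vector.extract.v16i32.nxv4i32 calls that the fixed_int32_t CHECK lines expect, prints the module, and returns non-zero if the verifier rejects it.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("vlst-cast-demo", Ctx);
      IRBuilder<> B(Ctx);

      Type *FixedTy = FixedVectorType::get(B.getInt32Ty(), 16);      // <16 x i32>
      Type *ScalableTy = ScalableVectorType::get(B.getInt32Ty(), 4); // <vscale x 4 x i32>

      // <vscale x 4 x i32> @roundtrip(<16 x i32> %v)
      Function *Fn =
          Function::Create(FunctionType::get(ScalableTy, {FixedTy}, false),
                           Function::ExternalLinkage, "roundtrip", M);
      B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", Fn));

      // Fixed -> scalable, scalable -> fixed, then fixed -> scalable again,
      // exercising both helpers.
      Value *Scalable = B.CreateInsertVector(ScalableTy, UndefValue::get(ScalableTy),
                                             Fn->getArg(0), B.getInt64(0),
                                             "castScalableSve");
      Value *Fixed = B.CreateExtractVector(FixedTy, Scalable, B.getInt64(0),
                                           "castFixedSve");
      B.CreateRet(B.CreateInsertVector(ScalableTy, UndefValue::get(ScalableTy),
                                       Fixed, B.getInt64(0)));

      M.print(outs(), nullptr);
      return verifyModule(M, &errs()) ? 1 : 0;
    }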