Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -190,6 +190,15 @@
   /// legalization.
   virtual bool shouldSplitVectorType(EVT /*VT*/) const { return false; }
 
+  /// Return true if a vector of the given type should be scalarized during
+  /// type legalization.
+  virtual bool shouldScalarizeVectorType(EVT VT) const {
+    return VT.getVectorNumElements() == 1;
+  }
+
+  /// Return true if a vector of the given type is preferred to be widened.
+  virtual bool preferWidenVectorType(EVT VT) const { return false; }
+
   // There are two general methods for expanding a BUILD_VECTOR node:
   //   1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle
   //      them together.
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -1091,27 +1091,29 @@
     // that wider vector type.
     MVT EltVT = VT.getVectorElementType();
     unsigned NElts = VT.getVectorNumElements();
-    if (NElts != 1 && !shouldSplitVectorType(VT)) {
+    if (!shouldScalarizeVectorType(VT) && !shouldSplitVectorType(VT)) {
       bool IsLegalWiderType = false;
       // First try to promote the elements of integer vectors. If no legal
       // promotion was found, fallback to the widen-vector method.
-      for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
-        MVT SVT = (MVT::SimpleValueType)nVT;
-        // Promote vectors of integers to vectors with the same number
-        // of elements, with a wider element type.
-        if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() && SVT.getVectorNumElements() == NElts &&
-            isTypeLegal(SVT) && SVT.getScalarType().isInteger()) {
-          TransformToType[i] = SVT;
-          RegisterTypeForVT[i] = SVT;
-          NumRegistersForVT[i] = 1;
-          ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
-          IsLegalWiderType = true;
-          break;
+      if (!preferWidenVectorType(VT)) {
+        for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+          MVT SVT = (MVT::SimpleValueType)nVT;
+          // Promote vectors of integers to vectors with the same number
+          // of elements, with a wider element type.
+          if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() && SVT.getVectorNumElements() == NElts &&
+              isTypeLegal(SVT) && SVT.getScalarType().isInteger()) {
+            TransformToType[i] = SVT;
+            RegisterTypeForVT[i] = SVT;
+            NumRegistersForVT[i] = 1;
+            ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
+            IsLegalWiderType = true;
+            break;
+          }
         }
-      }
-      if (IsLegalWiderType) continue;
+        if (IsLegalWiderType) continue;
+      }
 
       // Try to widen the vector.
       for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -324,6 +324,10 @@
 
   bool shouldExpandAtomicInIR(Instruction *Inst) const override;
 
+  bool shouldScalarizeVectorType(EVT VT) const override;
+
+  bool preferWidenVectorType(EVT VT) const override;
+
 private:
   /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
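The interaction of the three hooks is easier to follow outside the diff. The sketch below is a self-contained C++ model of the decision order the patched loop in TargetLoweringBase.cpp now follows for an illegal vector type. The Action enum, Hooks struct and chooseAction function are simplified stand-ins invented for this note, not the real LLVM interfaces (the real code iterates the MVT table); CanPromote and CanWiden model whether the search for a legal wider type succeeds.

    #include <cstdio>

    enum class Action { Scalarize, Split, PromoteInteger, Widen };

    // Simplified stand-ins for the TargetLowering hooks queried by the loop.
    struct Hooks {
      bool ShouldScalarize; // replaces the old hard-coded "NElts == 1" test
      bool ShouldSplit;     // shouldSplitVectorType(VT)
      bool PreferWiden;     // new hook: skip integer promotion, try widening
    };

    Action chooseAction(const Hooks &H, bool CanPromote, bool CanWiden) {
      if (!H.ShouldScalarize && !H.ShouldSplit) {
        // Integer promotion is attempted only when the target does not
        // prefer widening for this type.
        if (!H.PreferWiden && CanPromote)
          return Action::PromoteInteger;
        if (CanWiden)
          return Action::Widen;
      }
      // Otherwise fall back to splitting or scalarizing, as before the patch.
      return H.ShouldSplit ? Action::Split : Action::Scalarize;
    }

    int main() {
      // AArch64 v1i8 after this patch: not scalarized, widening preferred,
      // and a legal wider type (v8i8) exists, so it is widened.
      Hooks V1I8 = {false, false, true};
      std::puts(chooseAction(V1I8, true, true) == Action::Widen
                    ? "v1i8 -> widened"
                    : "unexpected");
    }

The net effect is that a target can now answer "no" to scalarization for a one-element vector and steer it directly to the widening search, which is exactly what the AArch64 overrides below do.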
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7864,6 +7864,23 @@
   return Inst->getType()->getPrimitiveSizeInBits() <= 128;
 }
 
+// During type legalization, v1i8, v1i16, v1i32 and v1f32 should not be scalarized.
+bool AArch64TargetLowering::shouldScalarizeVectorType(EVT VT) const {
+  assert(VT.isVector() && "Expected vector type");
+  MVT SVT = VT.getSimpleVT();
+  return VT.getVectorNumElements() == 1 && SVT != MVT::v1i8 && SVT != MVT::v1i16
+      && SVT != MVT::v1i32 && SVT != MVT::v1f32;
+}
+
+// During type legalization, v1i8, v1i16 and v1i32 are preferred to be widened
+// to v8i8, v4i16 and v2i32 instead of being promoted; v1f32 is widened to v2f32.
+bool AArch64TargetLowering::preferWidenVectorType(EVT VT) const {
+  assert(VT.isVector() && "Expected vector type");
+  MVT SVT = VT.getSimpleVT();
+  return SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
+      || SVT == MVT::v1f32;
+}
+
 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                              AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Index: test/CodeGen/AArch64/arm64-neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-copy.ll
+++ test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -842,7 +842,7 @@
 
 define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
 ; CHECK-LABEL: testDUP.v1i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK: dup v0.8b, v0.b[0]
   %b = extractelement <1 x i8> %a, i32 0
   %c = insertelement <8 x i8> undef, i8 %b, i32 0
   %d = insertelement <8 x i8> %c, i8 %b, i32 1
@@ -857,7 +857,7 @@
 
 define <8 x i16> @testDUP.v1i16(<1 x i16> %a) {
 ; CHECK-LABEL: testDUP.v1i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: dup v0.8h, v0.h[0]
   %b = extractelement <1 x i16> %a, i32 0
   %c = insertelement <8 x i16> undef, i16 %b, i32 0
   %d = insertelement <8 x i16> %c, i16 %b, i32 1
@@ -872,7 +872,7 @@
 
 define <4 x i32> @testDUP.v1i32(<1 x i32> %a) {
 ; CHECK-LABEL: testDUP.v1i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK: dup v0.4s, v0.s[0]
   %b = extractelement <1 x i32> %a, i32 0
   %c = insertelement <4 x i32> undef, i32 %b, i32 0
   %d = insertelement <4 x i32> %c, i32 %b, i32 1
@@ -1411,35 +1411,35 @@
 
 define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
 ; CHECK-LABEL: concat_vector_v4i16:
-; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK: dup v0.4h, v0.h[0]
   %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
   ret <4 x i16> %r
 }
 
 define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
 ; CHECK-LABEL: concat_vector_v4i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK: dup v0.4s, v0.s[0]
   %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
   ret <4 x i32> %r
 }
 
 define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
 ; CHECK-LABEL: concat_vector_v8i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK: dup v0.8b, v0.b[0]
   %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
   ret <8 x i8> %r
 }
 
 define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
 ; CHECK-LABEL: concat_vector_v8i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: dup v0.8h, v0.h[0]
   %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
   ret <8 x i16> %r
 }
 
 define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
 ; CHECK-LABEL: concat_vector_v16i8:
-; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+; CHECK: dup v0.16b, v0.b[0]
   %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
   ret <16 x i8> %r
 }
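The updated CHECK lines above show the intended codegen effect: with v1i8/v1i16/v1i32 widened instead of promoted, the single element stays in a SIMD register, so the broadcast becomes the lane-indexed form dup v0.8b, v0.b[0] rather than a transfer to a general-purpose register followed by something like dup v0.8b, w8 (w8 is only an example register; the old checks matched any w register). The overridden predicates themselves are simple set-membership tests; below is a minimal standalone model, where the V1 enum is an illustrative stand-in for MVT, not an LLVM type.

    #include <cassert>

    // Illustrative stand-in for MVT's one-element vector types.
    enum class V1 { I8, I16, I32, F32, I64, F64 };

    // Mirrors AArch64TargetLowering::preferWidenVectorType: these four types
    // are widened (v1i8->v8i8, v1i16->v4i16, v1i32->v2i32, v1f32->v2f32).
    bool preferWiden(V1 VT) {
      return VT == V1::I8 || VT == V1::I16 || VT == V1::I32 || VT == V1::F32;
    }

    // Mirrors shouldScalarizeVectorType for one-element vectors: anything not
    // in the widen list keeps the old scalarize-by-default behaviour. (v1i64
    // and v1f64 are already legal on AArch64, so the legalizer never asks
    // about them.)
    bool shouldScalarize(V1 VT) { return !preferWiden(VT); }

    int main() {
      assert(preferWiden(V1::I8) && !shouldScalarize(V1::I8));
      assert(shouldScalarize(V1::I64)); // hook value; unused for legal types
      return 0;
    }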
Index: test/CodeGen/AArch64/arm64-neon-select_cc.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-select_cc.ll
+++ test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -136,8 +136,8 @@
 
 define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
 ; CHECK-LABEL: test_select_cc_v1f32:
-; CHECK: fcmp s0, s1
-; CHECK-NEXT: fcsel s0, s2, s3, eq
+; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s
+; CHECK-NEXT: bsl [[MASK]].8b, v2.8b, v3.8b
   %cmp31 = fcmp oeq float %a, %b
   %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
   ret <1 x float> %e
Index: test/CodeGen/AArch64/trunc-v1i64.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/trunc-v1i64.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+; An optimization in DAG Combiner to fold
+; (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)
+; will generate nodes like:
+; v1i32 trunc v1i64, v1i16 trunc v1i64, v1i8 trunc v1i64.
+; Such nodes are scalarized by default during type legalization, but that
+; scalarization causes an assertion failure, as v1i64 is a legal type on
+; AArch64. We change the default behaviour from scalarization to widening.
+
+; FIXME: Currently an XTN is generated for v1i32, but it can be optimized
+; away, just like for v1i16 and v1i8, where no XTN is generated.
+
+define <2 x i32> @test_v1i32_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i32_0:
+; CHECK: xtn v0.2s, v0.2d
+  %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
+  %2 = trunc <2 x i64> %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+define <2 x i32> @test_v1i32_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i32_1:
+; CHECK: xtn v0.2s, v0.2d
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+  %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> zeroinitializer
+  %2 = trunc <2 x i64> %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+define <4 x i16> @test_v1i16_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i16_0:
+; CHECK-NOT: xtn
+  %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %2 = trunc <4 x i64> %1 to <4 x i16>
+  ret <4 x i16> %2
+}
+
+define <4 x i16> @test_v1i16_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i16_1:
+; CHECK-NOT: xtn
+; CHECK: dup v0.4h, v0.h[0]
+  %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> zeroinitializer
+  %2 = trunc <4 x i64> %1 to <4 x i16>
+  ret <4 x i16> %2
+}
+
+define <8 x i8> @test_v1i8_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i8_0:
+; CHECK-NOT: xtn
+  %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = trunc <8 x i64> %1 to <8 x i8>
+  ret <8 x i8> %2
+}
+
+define <8 x i8> @test_v1i8_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i8_1:
+; CHECK-NOT: xtn
+; CHECK: dup v0.8b, v0.b[0]
+  %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> zeroinitializer
+  %2 = trunc <8 x i64> %1 to <8 x i8>
+  ret <8 x i8> %2
+}
\ No newline at end of file
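For context on where the one-element trunc nodes exercised by trunc-v1i64.ll come from: the DAG combine cited in the test header distributes a truncate over the operands of a CONCAT_VECTORS node. The toy model below illustrates just that fold shape; the Node struct and mk helper are invented for illustration and are not the SDNode API.

    #include <cstdio>
    #include <memory>
    #include <string>
    #include <utility>
    #include <vector>

    // Toy DAG node; an illustrative stand-in for SDNode.
    struct Node {
      std::string Op;                         // "trunc", "concat", "undef", "x"
      int NumElts;                            // vector element count
      std::vector<std::shared_ptr<Node>> Ops; // operands
    };
    using NodeP = std::shared_ptr<Node>;

    NodeP mk(std::string Op, int NumElts, std::vector<NodeP> Ops = {}) {
      return std::make_shared<Node>(Node{std::move(Op), NumElts, std::move(Ops)});
    }

    // Models: fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)
    // Each concat operand is truncated separately, so a concat of v1i64
    // values yields v1i32/v1i16/v1i8 trunc nodes: the nodes this patch now
    // widens instead of scalarizing.
    NodeP foldTruncOfConcat(const NodeP &Trunc) {
      if (Trunc->Op != "trunc" || Trunc->Ops.size() != 1 ||
          Trunc->Ops[0]->Op != "concat")
        return Trunc;
      std::vector<NodeP> NewOps;
      for (const NodeP &Op : Trunc->Ops[0]->Ops)
        NewOps.push_back(mk("trunc", Op->NumElts, {Op})); // v1 trunc made here
      return mk("concat", Trunc->NumElts, std::move(NewOps));
    }

    int main() {
      NodeP X = mk("x", 1);                            // a v1i64 value
      NodeP CC = mk("concat", 2, {X, mk("undef", 1)}); // v2i64 concat
      NodeP T = mk("trunc", 2, {CC});                  // v2i32 result
      NodeP F = foldTruncOfConcat(T);                  // two v1i32 truncs
      std::printf("%s with %zu operands\n", F->Op.c_str(), F->Ops.size());
      return 0;
    }

Before this patch, the resulting v1i32/v1i16/v1i8 trunc nodes hit the default scalarization path and triggered the assertion, because their v1i64 operand is a legal type on AArch64 and was never meant to be broken apart.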