Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11499,6 +11499,62 @@ return SDValue(); } +static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT OpVT = N->getOperand(0).getValueType(); + + // If the operands are legal vectors, leave them alone. + if (TLI.isTypeLegal(OpVT)) + return SDValue(); + + SDLoc DL(N); + EVT VT = N->getValueType(0); + SmallVector Ops; + + EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits()); + SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); + + // Keep track of what we encounter. + bool AnyInteger = false; + bool AnyFP = false; + for (const SDValue &Op : N->ops()) { + if (ISD::BITCAST == Op.getOpcode() && + !Op.getOperand(0).getValueType().isVector()) + Ops.push_back(Op.getOperand(0)); + else if (ISD::UNDEF == Op.getOpcode()) + Ops.push_back(ScalarUndef); + else + return SDValue(); + + if (Ops.back().getValueType().isFloatingPoint()) + AnyFP = true; + else + AnyInteger = true; + } + + // If any of the operands is a floating point scalar bitcast to a vector, + // use floating point types throughout, and bitcast everything. + // Replace UNDEFs by another scalar UNDEF node, of the final desired type. + if (AnyFP) { + SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits()); + ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); + if (AnyInteger) { + for (SDValue &Op : Ops) { + if (Op.getValueType() != SVT) { + Op = DAG.getNode(ISD::BITCAST, DL, SVT, Op); + if (Op.getOpcode() == ISD::UNDEF) + Op = ScalarUndef; + } + } + } + } + + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT, + VT.getSizeInBits() / SVT.getSizeInBits()); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Ops)); +} + SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of // EXTRACT_SUBVECTOR operations. If so, and if the EXTRACT_SUBVECTOR vector @@ -11601,6 +11657,10 @@ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds); } + // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR. + if (SDValue V = combineConcatVectorOfScalars(N, DAG)) + return V; + // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR // nodes often generate nop CONCAT_VECTOR nodes. // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that Index: llvm/trunk/test/CodeGen/AArch64/concat_vector-scalar-combine.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/concat_vector-scalar-combine.ll +++ llvm/trunk/test/CodeGen/AArch64/concat_vector-scalar-combine.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; Test the (concat_vectors (bitcast (scalar)), ..) pattern. + +define <8 x i8> @test_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_v2i8_to_v8i8_dup: +; CHECK-NEXT: dup.4h v0, w0 +; CHECK-NEXT: ret + %t = trunc i32 %x to i16 + %0 = bitcast i16 %t to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> + ret <8 x i8> %1 +} + +define <8 x i8> @test_concat_scalar_v4i8_to_v8i8_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_v4i8_to_v8i8_dup: +; CHECK-NEXT: dup.2s v0, w0 +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <4 x i8> + %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> + ret <8 x i8> %1 +} + +define <8 x i16> @test_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_v2i16_to_v8i16_dup: +; CHECK-NEXT: dup.4s v0, w0 +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <2 x i16> + %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> + ret <8 x i16> %1 +} + +define <8 x i8> @test_concat_scalars_2x_v2i8_to_v8i8(i32 %x, i32 %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_2x_v2i8_to_v8i8: +; CHECK-NEXT: ins.h v0[0], w0 +; CHECK-NEXT: ins.h v0[1], w1 +; CHECK-NEXT: ins.h v0[3], w1 +; CHECK-NEXT: ret + %tx = trunc i32 %x to i16 + %ty = trunc i32 %y to i16 + %bx = bitcast i16 %tx to <2 x i8> + %by = bitcast i16 %ty to <2 x i8> + %r = shufflevector <2 x i8> %bx, <2 x i8> %by, <8 x i32> + ret <8 x i8> %r +} + +define <8 x i8> @test_concat_scalars_2x_v4i8_to_v8i8_dup(i32 %x, i32 %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_2x_v4i8_to_v8i8_dup: +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: ins.s v0[1], w0 +; CHECK-NEXT: ret + %bx = bitcast i32 %x to <4 x i8> + %by = bitcast i32 %y to <4 x i8> + %r = shufflevector <4 x i8> %bx, <4 x i8> %by, <8 x i32> + ret <8 x i8> %r +} + +define <8 x i16> @test_concat_scalars_2x_v2i16_to_v8i16_dup(i32 %x, i32 %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_2x_v2i16_to_v8i16_dup: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ins.s v0[2], w1 +; CHECK-NEXT: ins.s v0[3], w0 +; CHECK-NEXT: ret + %bx = bitcast i32 %x to <2 x i16> + %by = bitcast i32 %y to <2 x i16> + %r = shufflevector <2 x i16> %bx, <2 x i16> %by, <8 x i32> + ret <8 x i16> %r +} + +; Also make sure we minimize bitcasts. + +; This is a pretty artificial testcase: make sure we bitcast to floating-point +; if any of the scalars is floating-point. +define <8 x i8> @test_concat_scalars_mixed_2x_v2i8_to_v8i8(float %dummy, i32 %x, half %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_mixed_2x_v2i8_to_v8i8: +; CHECK-NEXT: fmov s[[X:[0-9]+]], w0 +; CHECK-NEXT: ins.h v0[0], v[[X]][0] +; CHECK-NEXT: ins.h v0[1], v1[0] +; CHECK-NEXT: ins.h v0[2], v[[X]][0] +; CHECK-NEXT: ins.h v0[3], v1[0] +; CHECK-NEXT: ret + %t = trunc i32 %x to i16 + %0 = bitcast i16 %t to <2 x i8> + %y0 = bitcast half %y to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> + ret <8 x i8> %1 +} + +define <2 x float> @test_concat_scalars_fp_2x_v2i8_to_v8i8(float %dummy, half %x, half %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_fp_2x_v2i8_to_v8i8: +; CHECK-NEXT: ins.h v0[0], v1[0] +; CHECK-NEXT: ins.h v0[1], v2[0] +; CHECK-NEXT: ins.h v0[2], v1[0] +; CHECK-NEXT: ins.h v0[3], v2[0] +; CHECK-NEXT: ret + %0 = bitcast half %x to <2 x i8> + %y0 = bitcast half %y to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> + %2 = bitcast <8 x i8> %1 to <2 x float> + ret <2 x float> %2 +} + +define <4 x float> @test_concat_scalar_fp_v2i16_to_v16i8_dup(float %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_fp_v2i16_to_v16i8_dup: +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: ret + %0 = bitcast float %x to <2 x i16> + %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> + %2 = bitcast <8 x i16> %1 to <4 x float> + ret <4 x float> %2 +} + +attributes #0 = { nounwind }