Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9530,6 +9530,100 @@
   return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
 }
 
+// Check whether the shuffle mask M for a vector of type VT can be rewritten
+// for elements twice as wide: every adjacent pair of mask entries must select
+// an aligned pair of adjacent source elements, where an undef (-1) entry
+// matches anything. On success NewMask receives one entry per pair and true
+// is returned; on failure NewMask is left empty and false is returned.
+static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
+                           SmallVectorImpl<int> &NewMask) {
+  NewMask.clear();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts % 2 != 0)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    int M0 = M[i];
+    int M1 = M[i + 1];
+
+    // Both entries are undef, so the widened entry is undef as well.
+    if (M0 == -1 && M1 == -1) {
+      NewMask.push_back(-1);
+      continue;
+    }
+
+    // Only the second entry is defined; it must be the second (odd) element
+    // of an aligned pair.
+    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
+      NewMask.push_back(M1 / 2);
+      continue;
+    }
+
+    // The first entry is the first (even) element of an aligned pair and the
+    // second entry is either its successor or undef.
+    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
+      NewMask.push_back(M0 / 2);
+      continue;
+    }
+
+    NewMask.clear();
+    return false;
+  }
+
+  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
+  return true;
+}
+
+// Try to widen element type to get a new mask value for a better permutation
+// sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
+// UZP1/2, TRN1/2, REV, INS, etc.
+// For example:
+//  shufflevector <4 x i32> %a, <4 x i32> %b,
+//                <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+// is equivalent to:
+//  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
+// Finally, we can get:
+//  mov     v0.d[0], v1.d[1]
+//
+// Returns the widened shuffle bitcast back to the original type, or an empty
+// SDValue if the mask cannot be widened or the wider vector type is not legal.
+static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  EVT ScalarVT = VT.getVectorElementType();
+  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  // Bail out if either operand's element type differs from the result's.
+  if (ScalarVT != V0.getValueType().getVectorElementType() ||
+      ScalarVT != V1.getValueType().getVectorElementType())
+    return SDValue();
+
+  // Widening doubles the element size, and NEON shuffle instructions only
+  // handle elements of at most 64 bits, so elements that are already 64 bits
+  // or wider cannot benefit. Similarly, we also exclude the i1 type.
+  if (ElementSize >= 64 || ElementSize == 1)
+    return SDValue();
+
+  SmallVector<int, 8> NewMask;
+  if (!isWideTypeMask(Mask, VT, NewMask))
+    return SDValue();
+
+  MVT NewEltVT = VT.isFloatingPoint()
+                     ? MVT::getFloatingPointVT(ElementSize * 2)
+                     : MVT::getIntegerVT(ElementSize * 2);
+  MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NewVT))
+    return SDValue();
+
+  V0 = DAG.getBitcast(NewVT, V0);
+  V1 = DAG.getBitcast(NewVT, V1);
+  return DAG.getBitcast(VT, DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -9677,6 +9771,9 @@
         DstLaneV);
   }
 
+  if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
+    return NewSD;
+
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements(); Index: llvm/test/CodeGen/AArch64/concat-vector.ll =================================================================== --- llvm/test/CodeGen/AArch64/concat-vector.ll +++ llvm/test/CodeGen/AArch64/concat-vector.ll @@ -88,8 +88,7 @@ define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) { ; CHECK-LABEL: concat9: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #4 -; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %v4half= shufflevector <2 x half> %A, <2 x half> %B, <4 x i32> ret <4 x half> %v4half Index: llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll @@ -0,0 +1,157 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <4 x half> @shuffle1(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: shuffle1: +; CHECK: zip1 v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %res = shufflevector <2 x half> %a, <2 x half> %b, <4 x i32> + ret <4 x half> %res +} + +define <4 x half> @shuffle2(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: shuffle2: +; CHECK: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret +entry: + %res = shufflevector <2 x half> %a, <2 x half> %b, <4 x i32> + ret <4 x half> %res +} + +define <4 x i32> @shuffle3(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle3: +; CHECK: mov v0.d[0], v1.d[1] +; CHECK-NEXT: ret +entry: + %res = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + ret <4 x i32> %res +} + +define <4 x float> @shuffle4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: shuffle4: +; CHECK: mov v0.d[1], v1.d[1] +; CHECK-NEXT: ret +entry: + %res = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %res +} + +define <16 x i8> @shuffle5(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shuffle5: +; CHECK: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: 
ret +entry: + %res = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %res +} + +define <16 x i8> @shuffle6(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shuffle6: +; CHECK: trn1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret +entry: + %res = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %res +} + +define <8 x i8> @shuffle7(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: shuffle7: +; CHECK: uzp2 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> + ret <8 x i8> %res +} + +define <8 x i8> @shuffle8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: shuffle8: +; CHECK: trn2 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> + ret <8 x i8> %res +} + +; No blocks +define <8 x i8> @shuffle9(<8 x i8> %a) { +; CHECK-LABEL: shuffle9: +; CHECK: rev32 v0.4h, v0.4h + %res = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %res +} + +define <8 x i16> @shuffle10(<8 x i16> %a) { +; CHECK-LABEL: shuffle10: +; CHECK: rev64 v0.4s, v0.4s + %res = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %res +} + +define <4 x i16> @shuffle11(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle11: +; CHECK: mov v1.s[1], v0.s[0] +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + ret <4 x i16> %res +} + +define <8 x i8> @shuffle12(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: shuffle12: +; CHECK: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: trn2 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> + ret <8 x i8> %res +} + +define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: shuffle_widen_faili1: +; CHECK: rev32 v2.4h, v0.4h +; CHECK-NEXT: rev32 v3.4h, v1.4h +; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4 +; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; 
CHECK-NEXT: ret +entry: + %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> + ret <8 x i16> %res +} + +define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: shuffle_widen_fail2: +; CHECK: uzp1 v2.4h, v0.4h, v0.4h +; CHECK-NEXT: trn1 v3.4h, v1.4h, v1.4h +; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4 +; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> + ret <8 x i16> %res +} + +define <8 x i16> @shuffle_widen_fail3(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle_widen_fail3: +; CHECK: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ret +entry: + %res = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> + ret <8 x i16> %res +} Index: llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll @@ -460,8 +460,7 @@ ; Don't use SVE for 64-bit vectors. define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) #0 { ; CHECK-LABEL: concat_v4f16: -; CHECK: ext v0.8b, v0.8b, v0.8b, #4 -; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK: zip1 v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> ret <4 x half> %res