Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19560,12 +19560,62 @@
     }
   }
 
+  unsigned WhichResult;
+  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
+    return convertFromScalableVector(
+        DAG, VT,
+        DAG.getNode(AArch64ISD::ZIP1, DL, Op1.getValueType(), Op1, Op2));
+
+  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return convertFromScalableVector(
+        DAG, VT, DAG.getNode(Opc, DL, Op1.getValueType(), Op1, Op2));
+  }
+
+  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
+    return convertFromScalableVector(
+        DAG, VT,
+        DAG.getNode(AArch64ISD::ZIP1, DL, Op1.getValueType(), Op1, Op1));
+
+  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return convertFromScalableVector(
+        DAG, VT, DAG.getNode(Opc, DL, Op1.getValueType(), Op1, Op1));
+  }
+
+  // ZIP2, UZP1/UZP2 and REV operate on the full width of an SVE register, so
+  // they are only safe when the fixed-length vector exactly fills the
+  // register. For example, if the register is wider than the vector, ZIP2
+  // would take its operands' "high halves" from container lanes beyond the
+  // fixed-length vector, giving wrong or undefined results. Restrict these
+  // lowerings to the case where the vector length matches the target
+  // register size.
   unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
   unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
-  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits() &&
-      ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
-    Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
-    return convertFromScalableVector(DAG, VT, Op);
+  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
+    if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
+      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
+      return convertFromScalableVector(DAG, VT, Op);
+    }
+
+    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
+      return convertFromScalableVector(
+          DAG, VT,
+          DAG.getNode(AArch64ISD::ZIP2, DL, Op1.getValueType(), Op1, Op2));
+
+    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+      return convertFromScalableVector(
+          DAG, VT, DAG.getNode(Opc, DL, Op1.getValueType(), Op1, Op2));
+    }
+
+    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
+      return convertFromScalableVector(
+          DAG, VT,
+          DAG.getNode(AArch64ISD::ZIP2, DL, Op1.getValueType(), Op1, Op1));
+
+    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+      return convertFromScalableVector(
+          DAG, VT, DAG.getNode(Opc, DL, Op1.getValueType(), Op1, Op1));
+    }
   }
 
   return SDValue();
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
@@ -0,0 +1,664 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-sve-vector-bits-max=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-sve-vector-bits-max=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_512
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @zip1_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+; VBITS_EQ_256-LABEL: zip1_v32i8:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.b
+; VBITS_EQ_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    st2 { v0.16b, v1.16b }, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: zip1_v32i8:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.b, vl32
+; VBITS_EQ_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    st2 { v0.16b, v1.16b }, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = load <32 x i8>, <32 x i8>* %b
+  %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32>
+  store <32 x i8> %tmp3, <32 x i8>* %a
+  ret void
+}
+
+define void @zip_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_EQ_256-LABEL: zip_v32i16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #16
+; VBITS_EQ_256-NEXT:    ptrue p0.h
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    zip2 z4.h, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    zip1 z1.h, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    zip2 z3.h, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    zip1 z0.h, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    add z0.h, p0/m, z0.h, z1.h
+; VBITS_EQ_256-NEXT:    movprfx z1, z4
+; VBITS_EQ_256-NEXT:    add z1.h, p0/m, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: zip_v32i16:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.h
+; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    zip1 z2.h, z0.h, z1.h
+; VBITS_EQ_512-NEXT:    zip2 z0.h, z0.h, z1.h
+; VBITS_EQ_512-NEXT:    add z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <32 x i16>, <32 x i16>* %a
+  %tmp2 = load <32 x i16>, <32 x i16>* %b
+  %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32>
+  %tmp4 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32>
+  %tmp5 = add <32 x i16> %tmp3, %tmp4
+  store <32 x i16> %tmp5, <32 x i16>* %a
+  ret void
+}
+
+define void @zip1_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; VBITS_EQ_256-LABEL: zip1_v16i16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.h
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    st2 { v0.8h, v1.8h }, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: zip1_v16i16:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    st2 { v0.8h, v1.8h }, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = load <16 x i16>, <16 x i16>* %b
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32>
+  store <16 x i16> %tmp3, <16 x i16>* %a
+  ret void
+}
+
+define void @zip1_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; VBITS_EQ_256-LABEL: zip1_v8i32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.s
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    st2 { v0.4s, v1.4s }, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: zip1_v8i32:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    st2 { v0.4s, v1.4s }, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = load <8 x i32>, <8 x i32>* %b
+  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32>
+  store <8 x i32> %tmp3, <8 x i32>* %a
+  ret void
+}
+
+define void @zip_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+; VBITS_EQ_256-LABEL: zip_v4f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.d
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    zip1 z2.d, z0.d, z1.d
+; VBITS_EQ_256-NEXT:    zip2 z0.d, z0.d, z1.d
+; VBITS_EQ_256-NEXT:    fadd z0.d, z2.d, z0.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: zip_v4f64:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; VBITS_EQ_512-NEXT:    sub x9, sp, #48
+; VBITS_EQ_512-NEXT:    mov x29, sp
+; VBITS_EQ_512-NEXT:    and sp, x9, #0xffffffffffffffe0
+; VBITS_EQ_512-NEXT:    .cfi_def_cfa w29, 16
+; VBITS_EQ_512-NEXT:    .cfi_offset w30, -8
+; VBITS_EQ_512-NEXT:    .cfi_offset w29, -16
+; VBITS_EQ_512-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    mov z2.d, z1.d[3]
+; VBITS_EQ_512-NEXT:    mov z3.d, z0.d[3]
+; VBITS_EQ_512-NEXT:    stp d3, d2, [sp, #16]
+; VBITS_EQ_512-NEXT:    mov z2.d, z1.d[2]
+; VBITS_EQ_512-NEXT:    mov z3.d, z0.d[2]
+; VBITS_EQ_512-NEXT:    zip1 z0.d, z0.d, z1.d
+; VBITS_EQ_512-NEXT:    stp d3, d2, [sp]
+; VBITS_EQ_512-NEXT:    ld1d { z2.d }, p0/z, [sp]
+; VBITS_EQ_512-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_EQ_512-NEXT:    mov sp, x29
+; VBITS_EQ_512-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <4 x double>, <4 x double>* %a
+  %tmp2 = load <4 x double>, <4 x double>* %b
+  %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32>
+  %tmp4 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32>
+  %tmp5 = fadd <4 x double> %tmp3, %tmp4
+  store <4 x double> %tmp5, <4 x double>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors
+define void @zip_v4i32(<4 x i32>* %a, <4 x i32>* %b) #0 {
+; CHECK-LABEL: zip_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <4 x i32>, <4 x i32>* %a
+  %tmp2 = load <4 x i32>, <4 x i32>* %b
+  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32>
+  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32>
+  %tmp5 = add <4 x i32> %tmp3, %tmp4
+  store <4 x i32> %tmp5, <4 x i32>* %a
+  ret void
+}
+
+define void @zip1_v8i32_undef(<8 x i32>* %a) #0 {
+; VBITS_EQ_256-LABEL: zip1_v8i32_undef:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.s
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    mov v1.16b, v0.16b
+; VBITS_EQ_256-NEXT:    st2 { v0.4s, v1.4s }, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: zip1_v8i32_undef:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    mov v1.16b, v0.16b
+; VBITS_EQ_512-NEXT:    st2 { v0.4s, v1.4s }, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32>
+  store <8 x i32> %tmp2, <8 x i32>* %a
+  ret void
+}
+
+define void @trn_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+; VBITS_EQ_256-LABEL: trn_v32i8:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.b
+; VBITS_EQ_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    trn1 z2.b, z0.b, z1.b
+; VBITS_EQ_256-NEXT:    trn2 z0.b, z0.b, z1.b
+; VBITS_EQ_256-NEXT:    add z0.b, p0/m, z0.b, z2.b
+; VBITS_EQ_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: trn_v32i8:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.b, vl32
+; VBITS_EQ_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    trn1 z2.b, z0.b, z1.b
+; VBITS_EQ_512-NEXT:    trn2 z0.b, z0.b, z1.b
+; VBITS_EQ_512-NEXT:    add z0.b, p0/m, z0.b, z2.b
+; VBITS_EQ_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = load <32 x i8>, <32 x i8>* %b
+  %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32>
+  %tmp4 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32>
+  %tmp5 = add <32 x i8> %tmp3, %tmp4
+  store <32 x i8> %tmp5, <32 x i8>* %a
+  ret void
+}
+
+define void @trn_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_EQ_256-LABEL: trn_v32i16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #16
+; VBITS_EQ_256-NEXT:    ptrue p0.h
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    trn1 z4.h, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    trn1 z5.h, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    trn2 z0.h, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    trn2 z1.h, z1.h, z3.h
+; VBITS_EQ_256-NEXT:    add z0.h, p0/m, z0.h, z4.h
+; VBITS_EQ_256-NEXT:    add z1.h, p0/m, z1.h, z5.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: trn_v32i16:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.h
+; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    trn1 z2.h, z0.h, z1.h
+; VBITS_EQ_512-NEXT:    trn2 z0.h, z0.h, z1.h
+; VBITS_EQ_512-NEXT:    add z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <32 x i16>, <32 x i16>* %a
+  %tmp2 = load <32 x i16>, <32 x i16>* %b
+  %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32>
+  %tmp4 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32>
+  %tmp5 = add <32 x i16> %tmp3, %tmp4
+  store <32 x i16> %tmp5, <32 x i16>* %a
+  ret void
+}
+
+define void @trn_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; VBITS_EQ_256-LABEL: trn_v16i16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.h
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    trn1 z2.h, z0.h, z1.h
+; VBITS_EQ_256-NEXT:    trn2 z0.h, z0.h, z1.h
+; VBITS_EQ_256-NEXT:    add z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: trn_v16i16:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    trn1 z2.h, z0.h, z1.h
+; VBITS_EQ_512-NEXT:    trn2 z0.h, z0.h, z1.h
+; VBITS_EQ_512-NEXT:    add z0.h, p0/m, z0.h, z2.h
+; VBITS_EQ_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = load <16 x i16>, <16 x i16>* %b
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32>
+  %tmp4 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32>
+  %tmp5 = add <16 x i16> %tmp3, %tmp4
+  store <16 x i16> %tmp5, <16 x i16>* %a
+  ret void
+}
+
+define void @trn_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; VBITS_EQ_256-LABEL: trn_v8i32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.s
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    trn1 z2.s, z0.s, z1.s
+; VBITS_EQ_256-NEXT:    trn2 z0.s, z0.s, z1.s
+; VBITS_EQ_256-NEXT:    add z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: trn_v8i32:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    trn1 z2.s, z0.s, z1.s
+; VBITS_EQ_512-NEXT:    trn2 z0.s, z0.s, z1.s
+; VBITS_EQ_512-NEXT:    add z0.s, p0/m, z0.s, z2.s
+; VBITS_EQ_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = load <8 x i32>, <8 x i32>* %b
+  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32>
+  %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32>
+  %tmp5 = add <8 x i32> %tmp3, %tmp4
+  store <8 x i32> %tmp5, <8 x i32>* %a
+  ret void
+}
+
+define void @trn_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
+; VBITS_EQ_256-LABEL: trn_v4f64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.d
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    trn1 z2.d, z0.d, z1.d
+; VBITS_EQ_256-NEXT:    trn2 z0.d, z0.d, z1.d
+; VBITS_EQ_256-NEXT:    fadd z0.d, z2.d, z0.d
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: trn_v4f64:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_EQ_512-NEXT:    trn1 z2.d, z0.d, z1.d
+; VBITS_EQ_512-NEXT:    trn2 z0.d, z0.d, z1.d
+; VBITS_EQ_512-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <4 x double>, <4 x double>* %a
+  %tmp2 = load <4 x double>, <4 x double>* %b
+  %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32>
+  %tmp4 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32>
+  %tmp5 = fadd <4 x double> %tmp3, %tmp4
+  store <4 x double> %tmp5, <4 x double>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors
+define void @trn_v4f32(<4 x float>* %a, <4 x float>* %b) #0 {
+; CHECK-LABEL: trn_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <4 x float>, <4 x float>* %a
+  %tmp2 = load <4 x float>, <4 x float>* %b
+  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32>
+  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32>
+  %tmp5 = fadd <4 x float> %tmp3, %tmp4
+  store <4 x float> %tmp5, <4 x float>* %a
+  ret void
+}
+
+define void @trn_v8i32_undef(<8 x i32>* %a) #0 {
+; VBITS_EQ_256-LABEL: trn_v8i32_undef:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    ptrue p0.s
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    trn1 z1.s, z0.s, z0.s
+; VBITS_EQ_256-NEXT:    trn2 z0.s, z0.s, z0.s
+; VBITS_EQ_256-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_EQ_512-LABEL: trn_v8i32_undef:
+; VBITS_EQ_512:       // %bb.0:
+; VBITS_EQ_512-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_512-NEXT:    trn1 z1.s, z0.s, z0.s
+; VBITS_EQ_512-NEXT:    trn2 z0.s, z0.s, z0.s
+; VBITS_EQ_512-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_EQ_512-NEXT:    ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32>
+  %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32>
+  %tmp5 = add <8 x i32> %tmp3, %tmp4
+  store <8 x i32> %tmp5, <8 x i32>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+define void @zip2_v32i8(<32 x i8>* %a, <32 x i8>* %b) #1 {
+; CHECK-LABEL: zip2_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #16
+; CHECK-NEXT:    st2 { v0.16b, v1.16b }, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = load <32 x i8>, <32 x i8>* %b
+  %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32>
+  store <32 x i8> %tmp3, <32 x i8>* %a
+  ret void
+}
+
+define void @zip2_v16i16(<16 x i16>* %a, <16 x i16>* %b) #1 {
+; CHECK-LABEL: zip2_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #16
+; CHECK-NEXT:    st2 { v0.8h, v1.8h }, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = load <16 x i16>, <16 x i16>* %b
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32>
+  store <16 x i16> %tmp3, <16 x i16>* %a
+  ret void
+}
+
+define void @zip2_v8i32(<8 x i32>* %a, <8 x i32>* %b) #1 {
+; CHECK-LABEL: zip2_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #16
+; CHECK-NEXT:    st2 { v0.4s, v1.4s }, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = load <8 x i32>, <8 x i32>* %b
+  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32>
+  store <8 x i32> %tmp3, <8 x i32>* %a
+  ret void
+}
+
+define void @zip2_v8i32_undef(<8 x i32>* %a) #1 {
+; CHECK-LABEL: zip2_v8i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    st2 { v0.4s, v1.4s }, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32>
+  store <8 x i32> %tmp2, <8 x i32>* %a
+  ret void
+}
+
+define void @uzp_v32i8(<32 x i8>* %a, <32 x i8>* %b) #1 {
+; CHECK-LABEL: uzp_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    uzp1 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp2 z0.b, z0.b, z1.b
+; CHECK-NEXT:    add z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = load <32 x i8>, <32 x i8>* %b
+  %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32>
+  %tmp4 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32>
+  %tmp5 = add <32 x i8> %tmp3, %tmp4
+  store <32 x i8> %tmp5, <32 x i8>* %a
+  ret void
+}
+
+define void @uzp_v32i16(<32 x i16>* %a, <32 x i16>* %b) #1 {
+; CHECK-LABEL: uzp_v32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; CHECK-NEXT:    uzp1 z5.h, z1.h, z0.h
+; CHECK-NEXT:    uzp2 z0.h, z1.h, z0.h
+; CHECK-NEXT:    add z0.h, p0/m, z0.h, z5.h
+; CHECK-NEXT:    uzp1 z4.h, z3.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z3.h, z2.h
+; CHECK-NEXT:    movprfx z1, z4
+; CHECK-NEXT:    add z1.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <32 x i16>, <32 x i16>* %a
+  %tmp2 = load <32 x i16>, <32 x i16>* %b
+  %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32>
+  %tmp4 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32>
+  %tmp5 = add <32 x i16> %tmp3, %tmp4
+  store <32 x i16> %tmp5, <32 x i16>* %a
+  ret void
+}
+
+define void @uzp_v16i16(<16 x i16>* %a, <16 x i16>* %b) #1 {
+; CHECK-LABEL: uzp_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp2 z0.h, z0.h, z1.h
+; CHECK-NEXT:    add z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = load <16 x i16>, <16 x i16>* %b
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32>
+  %tmp4 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32>
+  %tmp5 = add <16 x i16> %tmp3, %tmp4
+  store <16 x i16> %tmp5, <16 x i16>* %a
+  ret void
+}
+
+define void @uzp_v8f32(<8 x float>* %a, <8 x float>* %b) #1 {
+; CHECK-LABEL: uzp_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp2 z0.s, z0.s, z1.s
+; CHECK-NEXT:    fadd z0.s, z2.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <8 x float>, <8 x float>* %a
+  %tmp2 = load <8 x float>, <8 x float>* %b
+  %tmp3 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32>
+  %tmp4 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32>
+  %tmp5 = fadd <8 x float> %tmp3, %tmp4
+  store <8 x float> %tmp5, <8 x float>* %a
+  ret void
+}
+
+define void @uzp_v4i64(<4 x i64>* %a, <4 x i64>* %b) #1 {
+; CHECK-LABEL: uzp_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    add z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <4 x i64>, <4 x i64>* %a
+  %tmp2 = load <4 x i64>, <4 x i64>* %b
+  %tmp3 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32>
+  %tmp4 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32>
+  %tmp5 = add <4 x i64> %tmp3, %tmp4
+  store <4 x i64> %tmp5, <4 x i64>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors
+define void @uzp_v8i16(<8 x i16>* %a, <8 x i16>* %b) #1 {
+; CHECK-LABEL: uzp_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    add v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <8 x i16>, <8 x i16>* %a
+  %tmp2 = load <8 x i16>, <8 x i16>* %b
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32>
+  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32>
+  %tmp5 = add <8 x i16> %tmp3, %tmp4
+  store <8 x i16> %tmp5, <8 x i16>* %a
+  ret void
+}
+
+define void @uzp_v8i32_undef(<8 x i32>* %a) #1 {
+; CHECK-LABEL: uzp_v8i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32>
+  %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32>
+  %tmp5 = add <8 x i32> %tmp3, %tmp4
+  store <8 x i32> %tmp5, <8 x i32>* %a
+  ret void
+}
+
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+
+define void @zip_vscale2_4(<4 x double>* %a, <4 x double>* %b) #2 {
+; CHECK-LABEL: zip_vscale2_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #48
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    mov z2.d, z1.d[3]
+; CHECK-NEXT:    mov z3.d, z0.d[3]
+; CHECK-NEXT:    stp d3, d2, [sp, #16]
+; CHECK-NEXT:    mov z2.d, z1.d[2]
+; CHECK-NEXT:    mov z3.d, z0.d[2]
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    stp d3, d2, [sp]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [sp]
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %tmp1 = load <4 x double>, <4 x double>* %a
+  %tmp2 = load <4 x double>, <4 x double>* %b
+  %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32>
+  %tmp4 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32>
+  %tmp5 = fadd <4 x double> %tmp3, %tmp4
+  store <4 x double> %tmp5, <4 x double>* %a
+  ret void
+}
+
+attributes #2 = { "target-features"="+sve" vscale_range(2,4) }