diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -324,6 +324,8 @@
   BITREVERSE_MERGE_PASSTHRU,
   BSWAP_MERGE_PASSTHRU,
+  REVH_MERGE_PASSTHRU,
+  REVW_MERGE_PASSTHRU,
   CTLZ_MERGE_PASSTHRU,
   CTPOP_MERGE_PASSTHRU,
   DUP_MERGE_PASSTHRU,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -204,6 +204,8 @@
     return false;
   case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
   case AArch64ISD::BSWAP_MERGE_PASSTHRU:
+  case AArch64ISD::REVH_MERGE_PASSTHRU:
+  case AArch64ISD::REVW_MERGE_PASSTHRU:
   case AArch64ISD::CTLZ_MERGE_PASSTHRU:
   case AArch64ISD::CTPOP_MERGE_PASSTHRU:
   case AArch64ISD::DUP_MERGE_PASSTHRU:
@@ -2227,6 +2229,8 @@
     MAKE_CASE(AArch64ISD::STNP)
     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
+    MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
+    MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
@@ -4213,6 +4217,12 @@
   case Intrinsic::aarch64_sve_revb:
     return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+  case Intrinsic::aarch64_sve_revh:
+    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
+                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+  case Intrinsic::aarch64_sve_revw:
+    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
+                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
   case Intrinsic::aarch64_sve_sxtb:
     return DAG.getNode(
         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
@@ -19530,6 +19540,34 @@
     return convertFromScalableVector(DAG, VT, Op);
   }
 
+  for (unsigned LaneSize : {64U, 32U, 16U}) {
+    if (isREVMask(ShuffleMask, VT, LaneSize)) {
+      EVT NewVT =
+          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
+      unsigned RevOp;
+      unsigned EltSz = VT.getScalarSizeInBits();
+      if (EltSz == 8)
+        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
+      else if (EltSz == 16)
+        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
+      else
+        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
+
+      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
+      Op = LowerToPredicatedOp(Op, DAG, RevOp);
+      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
+      return convertFromScalableVector(DAG, VT, Op);
+    }
+  }
+
+  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
+  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
+  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits() &&
+      ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
+    Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
+    return convertFromScalableVector(DAG, VT, Op);
+  }
+
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -231,6 +231,8 @@
 def AArch64frecpx_mt : SDNode<"AArch64ISD::FRECPX_MERGE_PASSTHRU", SDT_AArch64Arith>;
 def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch64Arith>;
 def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>;
 
 // These are like the above but we don't yet have need for ISD nodes. They allow
 // a single pattern to match intrinsic and ISD operand layouts.
@@ -680,8 +682,8 @@
   defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>;
   defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>;
-  defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
-  defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;
+  defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", AArch64revh_mt>;
+  defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", AArch64revw_mt>;
 
   defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse>;
   defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -6485,14 +6485,14 @@
   def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>;
   def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>;
 
-  def : SVE_3_Op_Pat(NAME # _S)>;
-  def : SVE_3_Op_Pat(NAME # _D)>;
+  def : SVE_1_Op_Passthru_Pat(NAME # _S)>;
+  def : SVE_1_Op_Passthru_Pat(NAME # _D)>;
 }
 
 multiclass sve_int_perm_rev_revw {
   def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>;
 
-  def : SVE_3_Op_Pat(NAME # _D)>;
+  def : SVE_1_Op_Passthru_Pat(NAME # _D)>;
 }
 
 class sve_int_perm_cpy_r sz8_64, string asm, ZPRRegOp zprty,
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -0,0 +1,470 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; REVB pattern for shuffle v32i8 -> v16i16
+define void @test_revbv16i16(<32 x i8>* %a) #0 {
+; CHECK-LABEL: test_revbv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: revb z0.h, p1/m, z0.h
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> 
+  store <32 x i8> %tmp2, <32 x i8>* %a
+  ret void
+}
+
+; REVB pattern for shuffle v32i8 -> v8i32
+define void @test_revbv8i32(<32 x i8>* %a) #0 {
+; CHECK-LABEL: test_revbv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: revb z0.s, p1/m, z0.s
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> 
+  store <32 x i8> %tmp2, <32 x i8>* %a
+  ret void
+}
+
+; REVB pattern for shuffle v32i8 -> v4i64
+define void @test_revbv4i64(<32 x i8>* %a) #0 {
+; CHECK-LABEL: test_revbv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: revb z0.d, p1/m, z0.d
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> 
+  store <32 x i8> %tmp2, <32 x i8>* %a
+  ret void
+}
+
+; REVH pattern for shuffle v16i16 -> v8i32
+define void @test_revhv8i32(<16 x i16>* %a) #0 {
+; CHECK-LABEL: test_revhv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: revh z0.s, p1/m, z0.s
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> 
+  store <16 x i16> %tmp2, <16 x i16>* %a
+  ret void
+}
+
+; REVH pattern for shuffle v16f16 -> v8f32
+define void @test_revhv8f32(<16 x half>* %a) #0 {
+; CHECK-LABEL: test_revhv8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: revh z0.s, p1/m, z0.s
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <16 x half>, <16 x half>* %a
+  %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> 
+  store <16 x half> %tmp2, <16 x half>* %a
+  ret void
+}
+
+; REVH pattern for shuffle v16i16 -> v4i64
+define void @test_revhv4i64(<16 x i16>* %a) #0 {
+; CHECK-LABEL: test_revhv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: revh z0.d, p1/m, z0.d
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> 
+  store <16 x i16> %tmp2, <16 x i16>* %a
+  ret void
+}
+
+; REVW pattern for shuffle v8i32 -> v4i64
+define void @test_revwv4i64(<8 x i32>* %a) #0 {
+; CHECK-LABEL: test_revwv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: revw z0.d, p1/m, z0.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> 
+  store <8 x i32> %tmp2, <8 x i32>* %a
+  ret void
+}
+
+; REVW pattern for shuffle v8f32 -> v4f64
+define void @test_revwv4f64(<8 x float>* %a) #0 {
+; CHECK-LABEL: test_revwv4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: revw z0.d, p1/m, z0.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x float>, <8 x float>* %a
+  %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> 
+  store <8 x float> %tmp2, <8 x float>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors
+define <16 x i8> @test_revv16i8(<16 x i8>* %a) #0 {
+; CHECK-LABEL: test_revv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: rev64 v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %tmp1 = load <16 x i8>, <16 x i8>* %a
+  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> 
+  ret <16 x i8> %tmp2
+}
+
+; REVW pattern for a two-input v8i32 shuffle that only uses the second input.
+define void @test_revwv8i32v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: test_revwv8i32v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT: revw z0.d, p1/m, z0.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = load <8 x i32>, <8 x i32>* %b
+  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> 
+  store <8 x i32> %tmp3, <8 x i32>* %a
+  ret void
+}
+
+; REVH pattern for a v32i16 shuffle with 256-bit and 512-bit SVE.
+define void @test_revhv32i16(<32 x i16>* %a) #0 {
+; VBITS_EQ_256-LABEL: test_revhv32i16:
+; VBITS_EQ_256: // %bb.0:
+; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT: ptrue p1.d
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: revh z0.d, p1/m, z0.d
+; VBITS_EQ_256-NEXT: revh z1.d, p1/m, z1.d
+; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: ret
+;
+; VBITS_GE_256-LABEL: test_revhv32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl32
+; VBITS_GE_256-NEXT: ptrue p1.d
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: revh z0.d, p1/m, z0.d
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %tmp1 = load <32 x i16>, <32 x i16>* %a
+  %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> 
+  store <32 x i16> %tmp2, <32 x i16>* %a
+  ret void
+}
+
+; Only reversing bytes / halfwords / words within elements is supported.
+define void @test_rev_elts_fail(<4 x i64>* %a) #1 {
+; CHECK-LABEL: test_rev_elts_fail:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #48
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: mov z1.d, z0.d[2]
+; CHECK-NEXT: mov z2.d, z0.d[3]
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: stp x9, x8, [sp, #16]
+; CHECK-NEXT: stp x10, x11, [sp]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %tmp1 = load <4 x i64>, <4 x i64>* %a
+  %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> 
+  store <4 x i64> %tmp2, <4 x i64>* %a
+  ret void
+}
+
+; The REV instruction reverses the order of all elements in the vector.
+; When the fixed vector length and the SVE register size do not match,
+; a REV instruction is not guaranteed to implement the shuffle pattern correctly.
+
+; With sve-vector-bits-min=256 and sve-vector-bits-max unset, the REV instruction can't be generated.
+define void @test_revv8i32(<8 x i32>* %a) #0 {
+; CHECK-LABEL: test_revv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #48
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w11, v0.s[3]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov z1.s, z0.s[4]
+; CHECK-NEXT: mov z2.s, z0.s[5]
+; CHECK-NEXT: mov z3.s, z0.s[6]
+; CHECK-NEXT: mov z0.s, z0.s[7]
+; CHECK-NEXT: stp w8, w10, [sp, #24]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: stp w11, w9, [sp, #16]
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: stp w8, w10, [sp, #8]
+; CHECK-NEXT: stp w11, w9, [sp]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> 
+  store <8 x i32> %tmp2, <8 x i32>* %a
+  ret void
+}
+
+; REV pattern for v32i8 shuffle with vscale_range(2,2)
+define void @test_revv32i8_vl256(<32 x i8>* %a) #1 {
+; CHECK-LABEL: test_revv32i8_vl256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: rev z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <32 x i8>, <32 x i8>* %a
+  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> 
+  store <32 x i8> %tmp2, <32 x i8>* %a
+  ret void
+}
+
+; REV pattern for v16i16 shuffle with vscale_range(2,2)
+define void @test_revv16i16_vl256(<16 x i16>* %a) #1 {
+; CHECK-LABEL: test_revv16i16_vl256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: rev z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> 
+  store <16 x i16> %tmp2, <16 x i16>* %a
+  ret void
+}
+
+; REV pattern for v8f32 shuffle with vscale_range(2,2)
+define void @test_revv8f32_vl256(<8 x float>* %a) #1 {
+; CHECK-LABEL: test_revv8f32_vl256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: rev z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x float>, <8 x float>* %a
+  %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> 
+  store <8 x float> %tmp2, <8 x float>* %a
+  ret void
+}
+
+; REV pattern for v4f64 shuffle with vscale_range(2,2)
+define void @test_revv4f64_vl256(<4 x double>* %a) #1 {
+; CHECK-LABEL: test_revv4f64_vl256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: rev z0.d, z0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <4 x double>, <4 x double>* %a
+  %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> 
+  store <4 x double> %tmp2, <4 x double>* %a
+  ret void
+}
+
+; REV pattern for a two-input v8i32 shuffle that only uses the second input, with vscale_range(2,2).
+define void @test_revv8i32v8i32(<8 x i32>* %a, <8 x i32>* %b) #1 {
+; CHECK-LABEL: test_revv8i32v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT: rev z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x i32>, <8 x i32>* %a
+  %tmp2 = load <8 x i32>, <8 x i32>* %b
+  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> 
+  store <8 x i32> %tmp3, <8 x i32>* %a
+  ret void
+}
+
+; Illegal REV pattern.
+define void @test_rev_fail(<16 x i16>* %a) #1 {
+; CHECK-LABEL: test_rev_fail:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #48
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: mov z1.h, z0.h[8]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: mov z5.h, z0.h[12]
+; CHECK-NEXT: mov z2.h, z0.h[9]
+; CHECK-NEXT: mov z3.h, z0.h[10]
+; CHECK-NEXT: mov z4.h, z0.h[11]
+; CHECK-NEXT: fmov w11, s2
+; CHECK-NEXT: strh w9, [sp, #30]
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: fmov w12, s3
+; CHECK-NEXT: strh w8, [sp, #14]
+; CHECK-NEXT: fmov w8, s4
+; CHECK-NEXT: mov z6.h, z0.h[13]
+; CHECK-NEXT: mov z7.h, z0.h[14]
+; CHECK-NEXT: mov z16.h, z0.h[15]
+; CHECK-NEXT: umov w10, v0.h[1]
+; CHECK-NEXT: strh w9, [sp, #22]
+; CHECK-NEXT: umov w9, v0.h[2]
+; CHECK-NEXT: strh w11, [sp, #28]
+; CHECK-NEXT: fmov w11, s6
+; CHECK-NEXT: strh w12, [sp, #26]
+; CHECK-NEXT: fmov w12, s7
+; CHECK-NEXT: strh w8, [sp, #24]
+; CHECK-NEXT: fmov w8, s16
+; CHECK-NEXT: strh w10, [sp, #12]
+; CHECK-NEXT: strh w11, [sp, #20]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: strh w12, [sp, #18]
+; CHECK-NEXT: umov w12, v0.h[4]
+; CHECK-NEXT: strh w8, [sp, #16]
+; CHECK-NEXT: umov w8, v0.h[5]
+; CHECK-NEXT: umov w10, v0.h[6]
+; CHECK-NEXT: strh w9, [sp, #10]
+; CHECK-NEXT: umov w9, v0.h[7]
+; CHECK-NEXT: strh w11, [sp, #8]
+; CHECK-NEXT: strh w12, [sp, #6]
+; CHECK-NEXT: strh w8, [sp, #4]
+; CHECK-NEXT: strh w10, [sp, #2]
+; CHECK-NEXT: strh w9, [sp]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %tmp1 = load <16 x i16>, <16 x i16>* %a
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> 
+  store <16 x i16> %tmp2, <16 x i16>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit shuffle with two inputs
+define void @test_revv8i16v8i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) #1 {
+; CHECK-LABEL: test_revv8i16v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #48
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: orr x9, x8, #0x1e
+; CHECK-NEXT: orr x10, x8, #0x1c
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: orr x12, x8, #0x10
+; CHECK-NEXT: orr x11, x8, #0x18
+; CHECK-NEXT: str h0, [sp, #22]
+; CHECK-NEXT: st1 { v0.h }[4], [x9]
+; CHECK-NEXT: orr x9, x8, #0xe
+; CHECK-NEXT: st1 { v0.h }[5], [x10]
+; CHECK-NEXT: orr x10, x8, #0xc
+; CHECK-NEXT: st1 { v0.h }[3], [x12]
+; CHECK-NEXT: mov w12, #26
+; CHECK-NEXT: st1 { v1.h }[4], [x9]
+; CHECK-NEXT: orr x9, x8, #0x8
+; CHECK-NEXT: st1 { v0.h }[7], [x11]
+; CHECK-NEXT: orr x11, x8, #0x2
+; CHECK-NEXT: st1 { v1.h }[5], [x10]
+; CHECK-NEXT: orr x10, x8, #0x4
+; CHECK-NEXT: st1 { v1.h }[7], [x9]
+; CHECK-NEXT: orr x9, x8, x12
+; CHECK-NEXT: st1 { v1.h }[2], [x11]
+; CHECK-NEXT: mov w11, #10
+; CHECK-NEXT: st1 { v1.h }[1], [x10]
+; CHECK-NEXT: mov w10, #18
+; CHECK-NEXT: st1 { v0.h }[6], [x9]
+; CHECK-NEXT: mov w9, #20
+; CHECK-NEXT: orr x9, x8, x9
+; CHECK-NEXT: orr x10, x8, x10
+; CHECK-NEXT: st1 { v1.h }[3], [x8]
+; CHECK-NEXT: orr x8, x8, x11
+; CHECK-NEXT: str h1, [sp, #6]
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1 { v0.h }[1], [x9]
+; CHECK-NEXT: st1 { v0.h }[2], [x10]
+; CHECK-NEXT: st1 { v1.h }[6], [x8]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: st1h { z0.h }, p0, [x2]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x i16>, <8 x i16>* %a
+  %tmp2 = load <8 x i16>, <8 x i16>* %b
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> 
+  store <16 x i16> %tmp3, <16 x i16>* %c
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
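For reference, the masks the new lowering matches via isREVMask are per-lane reversals. Below is a minimal hedged sketch (the function name is hypothetical and not part of the patch) of a v8i32 shuffle that swaps adjacent 32-bit elements within each 64-bit lane; with the fixed-length SVE lowering in effect (e.g. llc -aarch64-sve-vector-bits-min=256), the code above would select the predicated revw form for a mask like this.

; Hypothetical illustration only: a REVW-style shuffle mask (isREVMask with
; a 64-bit lane size) that swaps adjacent i32 elements within each 64-bit lane.
define <8 x i32> @revw_mask_example(<8 x i32> %v) "target-features"="+sve" {
  %r = shufflevector <8 x i32> %v, <8 x i32> undef,
                     <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i32> %r
}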