Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25146,7 +25146,70 @@
     }
   }
 
-  return SDValue();
+  if (!MaxSVESize || MinSVESize != MaxSVESize)
+    return SDValue();
+
+  bool Swap = false;
+  if (Op1.isUndef() || isZerosVector(Op1.getNode())) {
+    std::swap(Op1, Op2);
+    Swap = true;
+  }
+  bool IsUndefOrZero = Op2.isUndef() || isZerosVector(Op2.getNode());
+  unsigned BitsPerElt = VT.getVectorElementType().getSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned IndexLen = MinSVESize / BitsPerElt;
+  unsigned FillElements = IndexLen - NumElts;
+
+  if (BitsPerElt != 8 && BitsPerElt != 16 && BitsPerElt != 32 &&
+      BitsPerElt != 64)
+    return SDValue();
+  EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt);
+  EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
+
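+  // Build the TBL mask as a vector of i64 constants: remap each shuffle
+  // index into the index space of the (possibly swapped) operands, and use
+  // 255 for lanes that must read as zero (TBL yields zero for indices that
+  // are out of range).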
+  SmallVector<SDValue> TBLMask;
+  for (int Val : ShuffleMask) {
+    unsigned Offset = Val;
+    if (Swap)
+      Offset = Offset < NumElts ? Offset + NumElts : Offset - NumElts;
+    else if (IsUndefOrZero && Offset >= NumElts)
+      Offset = 255;
+    else if (Offset >= NumElts && FillElements)
+      Offset += IndexLen - NumElts;
+    TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i64));
+  }
+  for (unsigned i = 0; i < FillElements; ++i)
+    TBLMask.push_back(DAG.getConstant(255, DL, MVT::i64));
+
+  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
+  SDValue VecMask =
+      DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
+  SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
+
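+  // A shuffle reading from a single source (the other operand undef or
+  // zero, possibly after the swap above) lowers to a one-register TBL;
+  // mixing both sources needs the two-register TBL2, which is SVE2-only.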
+  SDValue Shuffle;
+  if (IsUndefOrZero || Swap) {
+    Shuffle = convertFromScalableVector(
+        DAG, VT,
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+                    DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
+                    Op1, SVEMask));
+  } else {
+    if (Subtarget->hasSVE2())
+      Shuffle = convertFromScalableVector(
+          DAG, VT,
+          DAG.getNode(
+              ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+              DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1,
+              Op2, SVEMask));
+    else
+      return SDValue();
+  }
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
 }
 
 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -165,7 +165,7 @@
 define void @test_revhv32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: test_revhv32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    ptrue p1.d
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
@@ -194,29 +194,13 @@
 define void @test_rev_elts_fail(ptr %a) #1 {
 ; CHECK-LABEL: test_rev_elts_fail:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    adrp x8, .LCPI11_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI11_0
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d[2]
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    mov z1.d, z0.d[3]
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    stp x9, x8, [sp, #16]
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    stp x10, x11, [sp]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -381,61 +365,13 @@
 define void @test_rev_fail(ptr %a) #1 {
 ; CHECK-LABEL: test_rev_fail:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    adrp x8, .LCPI20_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI20_0
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[8]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    mov z4.h, z0.h[11]
-; CHECK-NEXT:    mov z5.h, z0.h[12]
-; CHECK-NEXT:    mov z2.h, z0.h[9]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z3.h, z0.h[10]
-; CHECK-NEXT:    strh w9, [sp, #30]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z16.h, z0.h[15]
-; CHECK-NEXT:    fmov w11, s2
-; CHECK-NEXT:    fmov w12, s3
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z6.h, z0.h[13]
-; CHECK-NEXT:    mov z7.h, z0.h[14]
-; CHECK-NEXT:    umov w10, v0.h[1]
-; CHECK-NEXT:    strh w9, [sp, #22]
-; CHECK-NEXT:    umov w9, v0.h[2]
-; CHECK-NEXT:    strh w11, [sp, #28]
-; CHECK-NEXT:    fmov w11, s6
-; CHECK-NEXT:    strh w12, [sp, #26]
-; CHECK-NEXT:    fmov w12, s7
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    umov w8, v0.h[5]
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    strh w11, [sp, #20]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    strh w12, [sp, #18]
-; CHECK-NEXT:    umov w12, v0.h[4]
-; CHECK-NEXT:    umov w10, v0.h[6]
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    umov w9, v0.h[7]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    strh w11, [sp, #8]
-; CHECK-NEXT:    strh w12, [sp, #6]
-; CHECK-NEXT:    strh w10, [sp, #2]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
@@ -472,16 +408,16 @@
 ; CHECK-NEXT:    st1 { v1.h }[4], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0x4
 ; CHECK-NEXT:    st1 { v1.h }[5], [x10]
-; CHECK-NEXT:    mov w10, #26
+; CHECK-NEXT:    mov w10, #26 // =0x1a
 ; CHECK-NEXT:    orr x10, x8, x10
 ; CHECK-NEXT:    st1 { v0.h }[3], [x12]
 ; CHECK-NEXT:    st1 { v1.h }[1], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0x2
 ; CHECK-NEXT:    st1 { v1.h }[7], [x11]
-; CHECK-NEXT:    mov w11, #20
-; CHECK-NEXT:    mov w12, #18
+; CHECK-NEXT:    mov w11, #20 // =0x14
+; CHECK-NEXT:    mov w12, #18 // =0x12
 ; CHECK-NEXT:    st1 { v0.h }[6], [x10]
-; CHECK-NEXT:    mov w10, #10
+; CHECK-NEXT:    mov w10, #10 // =0xa
 ; CHECK-NEXT:    orr x11, x8, x11
 ; CHECK-NEXT:    st1 { v1.h }[2], [x9]
 ; CHECK-NEXT:    orr x9, x8, x12
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
@@ -50,7 +50,7 @@
 define void @shuffle_ext_byone_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
@@ -93,7 +93,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v128i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl128
-; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    mov w8, #127 // =0x7f
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.b, xzr, x8
@@ -127,7 +127,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v256i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl256
-; CHECK-NEXT:    mov w8, #255
+; CHECK-NEXT:    mov w8, #255 // =0xff
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.b, xzr, x8
@@ -215,7 +215,7 @@
 define void @shuffle_ext_byone_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
@@ -254,7 +254,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v64i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    mov w8, #63 // =0x3f
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.h, xzr, x8
@@ -280,7 +280,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v128i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    mov w8, #127 // =0x7f
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.h, xzr, x8
@@ -351,7 +351,7 @@
 define void @shuffle_ext_byone_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
@@ -388,7 +388,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v32i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    mov w8, #31 // =0x1f
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.s, xzr, x8
@@ -410,7 +410,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v64i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    mov w8, #63 // =0x3f
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.s, xzr, x8
@@ -463,7 +463,7 @@
 define void @shuffle_ext_byone_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
@@ -499,7 +499,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v16i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    mov w8, #15
+; CHECK-NEXT:    mov w8, #15 // =0xf
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.d, xzr, x8
@@ -519,7 +519,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v32i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    mov w8, #31 // =0x1f
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.d, xzr, x8
@@ -578,7 +578,7 @@
 define void @shuffle_ext_byone_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
@@ -614,7 +614,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v64f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    mov w8, #63 // =0x3f
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.h, xzr, x8
@@ -640,7 +640,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v128f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    mov w8, #127 // =0x7f
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.h, xzr, x8
@@ -710,7 +710,7 @@
 define void @shuffle_ext_byone_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
@@ -744,7 +744,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v32f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    mov w8, #31 // =0x1f
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.s, xzr, x8
@@ -766,7 +766,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v64f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    mov w8, #63 // =0x3f
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.s, xzr, x8
@@ -818,7 +818,7 @@
 define void @shuffle_ext_byone_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
@@ -851,7 +851,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v16f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    mov w8, #15
+; CHECK-NEXT:    mov w8, #15 // =0xf
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.d, xzr, x8
@@ -871,7 +871,7 @@
 ; CHECK-LABEL: shuffle_ext_byone_v32f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    mov w8, #31 // =0x1f
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.d, xzr, x8
@@ -938,4 +938,61 @@
   ret void
 }
 
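+; A shuffle that reads elements from only one operand lowers to a
+; single-register TBL and needs just +sve; one that mixes both operands
+; needs TBL2 and therefore +sve2.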
+define void @shuffle_v4f64_tbl_op1(ptr %a, ptr %b) #1 {
+; CHECK-LABEL: shuffle_v4f64_tbl_op1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI42_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI42_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, ptr %a
+  %op2 = load <4 x double>, ptr %b
+  %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32>
+  store <4 x double> %ret, ptr %a
+  ret void
+}
+
+define void @shuffle_v4f64_tbl_op2(ptr %a, ptr %b) #1 {
+; CHECK-LABEL: shuffle_v4f64_tbl_op2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI43_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI43_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, ptr %a
+  %op2 = load <4 x double>, ptr %b
+  %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32>
+  store <4 x double> %ret, ptr %a
+  ret void
+}
+
+define void @shuffle_v4f64_tbl2(ptr %a, ptr %b) #2 {
+; CHECK-LABEL: shuffle_v4f64_tbl2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    index z2.d, #2, #1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    tbl z0.d, { z0.d, z1.d }, z2.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, ptr %a
+  %op2 = load <4 x double>, ptr %b
+  %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  store <4 x double> %ret, ptr %a
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve2" vscale_range(2,2) }
Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -137,7 +137,7 @@
 ; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x20, sp
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [x20]
 ; CHECK-NEXT:    ld2d { z2.d, z3.d }, p0/z, [x20, x8, lsl #3]
Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -379,4 +379,127 @@
   ret void
 }
 
+
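+; The TBL masks below are materialized from the constant pool; entries of
+; 255 are out-of-range indices, which TBL reads as zero, and pad lanes
+; beyond the fixed-length part of the vector.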
+; CHECK: .LCPI23_0:
+; CHECK-NEXT: .hword 10
+; CHECK-NEXT: .hword 1
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 8
+; CHECK-NEXT: .hword 255
+; CHECK-NEXT: .hword 255
+; CHECK-NEXT: .hword 255
+; CHECK-NEXT: .hword 255
+define <4 x i16> @shuffle_v4i16_tbl2(ptr %a, ptr %b) #1 {
+; CHECK-LABEL: shuffle_v4i16_tbl2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI23_0
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT:    tbl z0.h, { z0.h, z1.h }, z2.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i16>, ptr %a
+  %op2 = load <4 x i16>, ptr %b
+  %1 = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> <i32 6, i32 1, i32 3, i32 4>
+  ret <4 x i16> %1
+}
+
+; CHECK: .LCPI24_0:
+; CHECK-NEXT: .hword 0
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 7
+; CHECK-NEXT: .hword 7
+; CHECK-NEXT: .hword 15
+; CHECK-NEXT: .hword 0
+; CHECK-NEXT: .hword 0
+; CHECK-NEXT: .hword 1
+define <8 x i16> @shuffle_v8i16_tbl2(ptr %a, ptr %b) #1 {
+; CHECK-LABEL: shuffle_v8i16_tbl2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI24_0
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_0]
+; CHECK-NEXT:    tbl z0.h, { z0.h, z1.h }, z2.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, ptr %a
+  %op2 = load <8 x i16>, ptr %b
+  %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 0, i32 3, i32 7, i32 7, i32 15, i32 0, i32 0, i32 1>
+  ret <8 x i16> %1
+}
+
+; CHECK: .LCPI25_0:
+; CHECK-NEXT: .hword 0
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 7
+; CHECK-NEXT: .hword 7
+; CHECK-NEXT: .hword 1
+; CHECK-NEXT: .hword 0
+; CHECK-NEXT: .hword 0
+; CHECK-NEXT: .hword 1
+define <8 x i16> @shuffle_v8i16_tbl_op1(ptr %a, ptr %b) #1 {
+; CHECK-LABEL: shuffle_v8i16_tbl_op1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI25_0
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_0]
+; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, ptr %a
+  %op2 = load <8 x i16>, ptr %b
+  %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 0, i32 3, i32 7, i32 7, i32 1, i32 0, i32 0, i32 1>
+  ret <8 x i16> %1
+}
+
+; CHECK: .LCPI26_0:
+; CHECK-NEXT: .hword 2
+; CHECK-NEXT: .hword 5
+; CHECK-NEXT: .hword 2
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 7
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 2
+define <8 x i16> @shuffle_v8i16_tbl_op2(ptr %a, ptr %b) #1 {
+; CHECK-LABEL: shuffle_v8i16_tbl_op2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI26_0
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_0]
+; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, ptr %a
+  %op2 = load <8 x i16>, ptr %b
+  %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 10, i32 13, i32 10, i32 11, i32 15, i32 11, i32 11, i32 10>
+  ret <8 x i16> %1
+}
+
+; CHECK: .LCPI27_0:
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .word 3
+; CHECK-NEXT: .word 5
+; CHECK-NEXT: .word 1
+define <4 x float> @shuffle_v4f32_tbl_op2(ptr %a, ptr %b) #1 {
+; CHECK-LABEL: shuffle_v4f32_tbl_op2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI27_0
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI27_0]
+; CHECK-NEXT:    tbl z0.s, { z0.s, z1.s }, z2.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <4 x float>, ptr %a
+  %op2 = load <4 x float>, ptr %b
+  %1 = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> <i32 0, i32 3, i32 5, i32 1>
+  ret <4 x float> %1
+}
+
 attributes #0 = { "target-features"="+sve" }
+attributes #1 = { "target-features"="+sve2" vscale_range(1,1) }