Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2999,29 +2999,44 @@
 SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
   // We know that the extracted result type is legal.
   EVT SubVT = N->getValueType(0);
-
   SDValue Idx = N->getOperand(1);
   SDLoc dl(N);
   SDValue Lo, Hi;
 
-  if (SubVT.isScalableVector() !=
-      N->getOperand(0).getValueType().isScalableVector())
-    report_fatal_error("Extracting a fixed-length vector from an illegal "
-                       "scalable vector is not yet supported");
-
   GetSplitVector(N->getOperand(0), Lo, Hi);
 
-  uint64_t LoElts = Lo.getValueType().getVectorMinNumElements();
+  uint64_t LoEltsMin = Lo.getValueType().getVectorMinNumElements();
   uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
 
-  if (IdxVal < LoElts) {
-    assert(IdxVal + SubVT.getVectorMinNumElements() <= LoElts &&
+  if (IdxVal < LoEltsMin) {
+    assert(IdxVal + SubVT.getVectorMinNumElements() <= LoEltsMin &&
            "Extracted subvector crosses vector split!");
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
-  } else {
+  } else if (SubVT.isScalableVector() ==
+             N->getOperand(0).getValueType().isScalableVector())
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
-                       DAG.getVectorIdxConstant(IdxVal - LoElts, dl));
-  }
+                       DAG.getVectorIdxConstant(IdxVal - LoEltsMin, dl));
+
+  // Spill the vector to the stack. We should use the alignment for
+  // the smallest part.
+  SDValue Vec = N->getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                               SmallestAlign);
+
+  // Extract the subvector by loading the correct part.
+  StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVT, Idx);
+
+  return DAG.getLoad(
+      SubVT, dl, Store, StackPtr,
+      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
Index: llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -1,11 +1,333 @@
-; RUN: not --crash llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s -o - | FileCheck %s
 
-; Extracting a fixed-length vector from an illegal subvector
+; Extracting a legal fixed-length vector from an illegal subvector
 
-; CHECK-ERROR: ERROR: Extracting a fixed-length vector from an illegal scalable vector is not yet supported
 define <4 x i32> @extract_v4i32_nxv16i32_12(<vscale x 16 x i32> %arg) {
+; CHECK-LABEL: extract_v4i32_nxv16i32_12:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: ldr q0, [sp, #48]
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
   %ext = call <4 x i32> @llvm.vector.extract.v4i32.nxv16i32(<vscale x 16 x i32> %arg, i64 12)
   ret <4 x i32> %ext
 }
+
+define <8 x i16> @extract_v8i16_nxv32i16_8(<vscale x 32 x i16> %arg) {
+; CHECK-LABEL: extract_v8i16_nxv32i16_8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: ldr q0, [sp, #16]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <8 x i16> @llvm.vector.extract.v8i16.nxv32i16(<vscale x 32 x i16> %arg, i64 8)
+  ret <8 x i16> %ext
+}
+
+define <4 x i16> @extract_v4i16_nxv32i16_8(<vscale x 32 x i16> %arg) {
+; CHECK-LABEL: extract_v4i16_nxv32i16_8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: ldr d0, [sp, #32]
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <4 x i16> @llvm.vector.extract.v4i16.nxv32i16(<vscale x 32 x i16> %arg, i64 16)
+  ret <4 x i16> %ext
+}
+
+; The result type gets promoted, leading to us extracting 2 elements from a nxv32i16.
+; Hence we don't end up in SplitVecOp_EXTRACT_SUBVECTOR, but in SplitVecOp_EXTRACT_VECTOR_ELT instead.
+define <2 x i16> @extract_v2i16_nxv32i16_8(<vscale x 32 x i16> %arg) {
+; CHECK-LABEL: extract_v2i16_nxv32i16_8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-8
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, x8, #32
+; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: st1h { z3.h }, p0, [sp, #7, mul vl]
+; CHECK-NEXT: st1h { z2.h }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp, #5, mul vl]
+; CHECK-NEXT: st1h { z0.h }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ld1 { v0.h }[0], [x8]
+; CHECK-NEXT: addvl x8, sp, #4
+; CHECK-NEXT: add x8, x8, #34
+; CHECK-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: addvl sp, sp, #8
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <2 x i16> @llvm.vector.extract.v2i16.nxv32i16(<vscale x 32 x i16> %arg, i64 16)
+  ret <2 x i16> %ext
+}
+
+define <2 x i64> @extract_v2i64_nxv8i64_8(<vscale x 8 x i64> %arg) {
+; CHECK-LABEL: extract_v2i64_nxv8i64_8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: sub x8, x8, #2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cmp x8, #8
+; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsl x8, x8, #3
+; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: ldr q0, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64> %arg, i64 8)
+  ret <2 x i64> %ext
+}
+
+define <4 x float> @extract_v4f32_nxv16f32_12(<vscale x 16 x float> %arg) {
+; CHECK-LABEL: extract_v4f32_nxv16f32_12:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: ldr q0, [sp, #48]
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <4 x float> @llvm.vector.extract.v4f32.nxv16f32(<vscale x 16 x float> %arg, i64 12)
+  ret <4 x float> %ext
+}
+
+define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) {
+; CHECK-LABEL: extract_v2f32_nxv16f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <2 x float> @llvm.vector.extract.v2f32.nxv16f32(<vscale x 16 x float> %arg, i64 2)
+  ret <2 x float> %ext
+}
+
+define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) {
+; CHECK-LABEL: extract_v4i1_nxv32i1_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT: umov w8, v1.b[1]
+; CHECK-NEXT: umov w9, v1.b[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: umov w8, v1.b[3]
+; CHECK-NEXT: mov v0.h[2], w9
+; CHECK-NEXT: mov v0.h[3], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+  %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0)
+  ret <4 x i1> %ext
+}
+
+; The result type gets promoted, leading to us extracting 4 elements from a nxv32i16.
+; Hence we don't end up in SplitVecOp_EXTRACT_SUBVECTOR, but in SplitVecOp_EXTRACT_VECTOR_ELT instead.
+define <4 x i1> @extract_v4i1_nxv32i1_16(<vscale x 32 x i1> %arg) {
+; CHECK-LABEL: extract_v4i1_nxv32i1_16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-8
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: ptrue p2.b
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
+; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p2, [sp]
+; CHECK-NEXT: st1b { z0.b }, p2, [sp, #3, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p2, [sp, #2, mul vl]
+; CHECK-NEXT: st1b { z0.b }, p2, [sp, #5, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p2, [sp, #4, mul vl]
+; CHECK-NEXT: st1b { z0.b }, p2, [sp, #7, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p2, [sp, #6, mul vl]
+; CHECK-NEXT: ld1 { v0.b }[0], [x8]
+; CHECK-NEXT: addvl x8, sp, #2
+; CHECK-NEXT: add x8, x8, #17
+; CHECK-NEXT: ld1 { v0.b }[2], [x8]
+; CHECK-NEXT: addvl x8, sp, #4
+; CHECK-NEXT: add x8, x8, #18
+; CHECK-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-NEXT: addvl x8, sp, #6
+; CHECK-NEXT: add x8, x8, #19
+; CHECK-NEXT: ld1 { v0.b }[6], [x8]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: addvl sp, sp, #8
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 16)
+  ret <4 x i1> %ext
+}
+
+define <4 x i1> @extract_v4i1_v32i1_16(<32 x i1> %arg) {
+; CHECK-LABEL: extract_v4i1_v32i1_16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [sp, #64]
+; CHECK-NEXT: ldr w9, [sp, #72]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr w8, [sp, #80]
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: mov v0.h[2], w8
+; CHECK-NEXT: ldr w8, [sp, #88]
+; CHECK-NEXT: mov v0.h[3], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+  %ext = call <4 x i1> @llvm.vector.extract.v4i1.v32i1(<32 x i1> %arg, i64 16)
+  ret <4 x i1> %ext
+}
+
+; The result type gets promoted, leading to us extracting 4 elements from a nxv32i3.
+; Hence we don't end up in SplitVecOp_EXTRACT_SUBVECTOR, but in SplitVecOp_EXTRACT_VECTOR_ELT instead.
+define <4 x i3> @extract_v4i3_nxv32i3_16(<vscale x 32 x i3> %arg) {
+; CHECK-LABEL: extract_v4i3_nxv32i3_16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-8
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1b { z0.b }, p0, [sp]
+; CHECK-NEXT: st1b { z1.b }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1b { z0.b }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p0, [sp, #5, mul vl]
+; CHECK-NEXT: st1b { z0.b }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p0, [sp, #7, mul vl]
+; CHECK-NEXT: st1b { z0.b }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ld1 { v0.b }[0], [x8]
+; CHECK-NEXT: addvl x8, sp, #2
+; CHECK-NEXT: add x8, x8, #17
+; CHECK-NEXT: ld1 { v0.b }[2], [x8]
+; CHECK-NEXT: addvl x8, sp, #4
+; CHECK-NEXT: add x8, x8, #18
+; CHECK-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-NEXT: addvl x8, sp, #6
+; CHECK-NEXT: add x8, x8, #19
+; CHECK-NEXT: ld1 { v0.b }[6], [x8]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: addvl sp, sp, #8
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <4 x i3> @llvm.vector.extract.v4i3.nxv32i3(<vscale x 32 x i3> %arg, i64 16)
+  ret <4 x i3> %ext
+}
+
+; Extracting an illegal fixed-length vector from an illegal subvector
+
+define <2 x i32> @extract_v2i32_nxv16i32_2(<vscale x 16 x i32> %arg) {
+; CHECK-LABEL: extract_v2i32_nxv16i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %arg, i64 2)
+  ret <2 x i32> %ext
+}
+
+define <4 x i64> @extract_v4i64_nxv8i64_0(<vscale x 8 x i64> %arg) {
+; CHECK-LABEL: extract_v4i64_nxv8i64_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: ldr q1, [sp, #16]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %ext = call <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64> %arg, i64 0)
+  ret <4 x i64> %ext
+}
+
+
+declare <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64>, i64)
+declare <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64>, i64)
+declare <4 x float> @llvm.vector.extract.v4f32.nxv16f32(<vscale x 16 x float>, i64)
+declare <2 x float> @llvm.vector.extract.v2f32.nxv16f32(<vscale x 16 x float>, i64)
 declare <4 x i32> @llvm.vector.extract.v4i32.nxv16i32(<vscale x 16 x i32>, i64)
+declare <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32>, i64)
+declare <8 x i16> @llvm.vector.extract.v8i16.nxv32i16(<vscale x 32 x i16>, i64)
+declare <4 x i16> @llvm.vector.extract.v4i16.nxv32i16(<vscale x 32 x i16>, i64)
+declare <2 x i16> @llvm.vector.extract.v2i16.nxv32i16(<vscale x 32 x i16>, i64)
+declare <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1>, i64)
+declare <4 x i1> @llvm.vector.extract.v4i1.v32i1(<32 x i1>, i64)
+declare <4 x i3> @llvm.vector.extract.v4i3.nxv32i3(<vscale x 32 x i3>, i64)