Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21485,10 +21485,13 @@
   // Try to move vector bitcast after extract_subv by scaling extraction index:
   // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
   if (V.getOpcode() == ISD::BITCAST &&
-      V.getOperand(0).getValueType().isVector() &&
+      V.getOperand(0).getValueType().isFixedLengthVector() &&
       (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))) {
     SDValue SrcOp = V.getOperand(0);
     EVT SrcVT = SrcOp.getValueType();
+    // For scalable vectors, we purposely add the bitcasts, and only deal
+    // with integer extract_subvector. So we don't reorder those particular
+    // bitcasts.
    unsigned SrcNumElts = SrcVT.getVectorMinNumElements();
    unsigned DestNumElts = V.getValueType().getVectorMinNumElements();
    if ((SrcNumElts % DestNumElts) == 0) {
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14928,11 +14928,28 @@
 static SDValue
 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                SelectionDAG &DAG) {
+  EVT InVT = N->getOperand(0).getValueType();
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+  // Reorder when the scalable vector's inner type is floating point and the
+  // outer type is not a scalable vector.
+  if (InVT.isScalableVector() && InVT.isFloatingPoint() &&
+      DCI.isBeforeLegalize() && !OutVT.isScalableVector()) {
+    // Bitcast the input.
+    SDValue VecOp = N->getOperand(0);
+    VecOp = DAG.getNode(ISD::BITCAST, DL, InVT.changeTypeToInteger(), VecOp);
+    // Perform the extract in the integer type.
+    SDValue Extract =
+        DAG.getNode(N->getOpcode(), DL, OutVT.changeTypeToInteger(), VecOp,
+                    N->getOperand(1));
+    // Bitcast back to the FP type.
+    return DAG.getNode(ISD::BITCAST, DL, OutVT, Extract);
+  }
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
-  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
+  if (!OutVT.isScalableVector() || OutVT.getVectorElementType() != MVT::i1)
     return SDValue();
 
   SDValue V = N->getOperand(0);
@@ -14943,7 +14960,7 @@
   // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
   if (V.getOpcode() == ISD::SPLAT_VECTOR)
     if (isa<ConstantSDNode>(V.getOperand(0)))
-      return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
+      return DAG.getNode(ISD::SPLAT_VECTOR, DL, OutVT, V.getOperand(0));
 
   return SDValue();
 }
Index: llvm/test/CodeGen/AArch64/extract-insert-element-sve.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/extract-insert-element-sve.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve -asm-verbose=1 < %s | FileCheck %s
+
+; Extract from unpacked SVE vectors into different sizes of NEON registers.
+
+define <2 x float> @extract_subreg_2f32_unpacked_nx2xf32(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_subreg_2f32_unpacked_nx2xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %vec.e0 = extractelement <vscale x 2 x float> %vec, i32 0
+  %vec.e1 = extractelement <vscale x 2 x float> %vec, i32 1
+
+  %1 = insertelement <2 x float> undef, float %vec.e0, i32 0
+  %2 = insertelement <2 x float> %1, float %vec.e1, i32 1
+  ret <2 x float> %2
+}
+
+define <4 x half> @extract_subreg_4f16_unpacked_nx4xf16(<vscale x 4 x half> %vec) nounwind {
+; CHECK-LABEL: extract_subreg_4f16_unpacked_nx4xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %vec.e0 = extractelement <vscale x 4 x half> %vec, i32 0
+  %vec.e1 = extractelement <vscale x 4 x half> %vec, i32 1
+  %vec.e2 = extractelement <vscale x 4 x half> %vec, i32 2
+  %vec.e3 = extractelement <vscale x 4 x half> %vec, i32 3
+
+  %1 = insertelement <4 x half> undef, half %vec.e0, i32 0
+  %2 = insertelement <4 x half> %1, half %vec.e1, i32 1
+  %3 = insertelement <4 x half> %2, half %vec.e2, i32 2
+  %4 = insertelement <4 x half> %3, half %vec.e3, i32 3
+  ret <4 x half> %4
+}
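Not part of the patch: a minimal LLVM IR sketch of the node pattern the new AArch64 combine targets. The test above reaches the combine indirectly, since the insertelement/extractelement chain is folded into an EXTRACT_SUBVECTOR during DAG combining; the sketch below produces that node directly. It assumes the llvm.vector.extract intrinsic (named llvm.experimental.vector.extract in older releases), and the function name @extract_lo_v2f32 is hypothetical.

; Sketch only, not from the patch. Lowering the intrinsic call yields
;   v2f32 = extract_subvector nxv2f32 %vec, 0
; which performExtractSubvectorCombine rewrites before legalization as
;   v2f32 = bitcast (v2i32 extract_subvector (nxv2i32 bitcast %vec), 0)
define <2 x float> @extract_lo_v2f32(<vscale x 2 x float> %vec) {
  %lo = call <2 x float> @llvm.vector.extract.v2f32.nxv2f32(<vscale x 2 x float> %vec, i64 0)
  ret <2 x float> %lo
}

declare <2 x float> @llvm.vector.extract.v2f32.nxv2f32(<vscale x 2 x float>, i64)

Feeding this through the same RUN line as the new test (llc -mtriple=aarch64 -mattr=+sve) shows the resulting lowering for the unpacked nxv2f32 case.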