diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1228,6 +1228,10 @@
 
   setOperationAction(ISD::BITCAST, MVT::i8, Custom);
   setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
+  setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
+  setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
+
   setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
@@ -23019,6 +23023,23 @@
   Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
 }
 
+static void CustomNonLegalBITCASTResults(SDNode *N,
+                                         SmallVectorImpl<SDValue> &Results,
+                                         SelectionDAG &DAG, EVT ExtendVT,
+                                         EVT CastVT) {
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // Use SCALAR_TO_VECTOR for lane zero
+  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
+  SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
+  SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
+  Results.push_back(
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
+  return;
+}
+
 void AArch64TargetLowering::ReplaceBITCASTResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   SDLoc DL(N);
@@ -23026,6 +23047,21 @@
   EVT VT = N->getValueType(0);
   EVT SrcVT = Op.getValueType();
 
+  if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
+    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
+    return;
+  }
+
+  if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
+    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
+    return;
+  }
+
+  if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
+    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
+    return;
+  }
+
   if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
     assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
            "Expected fp->int bitcast!");
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -446,25 +446,16 @@
 define <4 x i8> @bitcast(i32 %0) {
 ; CHECK-LE-LABEL: bitcast:
 ; CHECK-LE:       // %bb.0:
-; CHECK-LE-NEXT:    sub sp, sp, #16
-; CHECK-LE-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-LE-NEXT:    str w0, [sp, #12]
-; CHECK-LE-NEXT:    ldr s0, [sp, #12]
-; CHECK-LE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-LE-NEXT:    add sp, sp, #16
+; CHECK-LE-NEXT:    fmov s0, w0
+; CHECK-LE-NEXT:    zip1 v0.8b, v0.8b, v0.8b
 ; CHECK-LE-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: bitcast:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    sub sp, sp, #16
-; CHECK-BE-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-BE-NEXT:    str w0, [sp, #12]
-; CHECK-BE-NEXT:    ldr s0, [sp, #12]
+; CHECK-BE-NEXT:    fmov s0, w0
 ; CHECK-BE-NEXT:    rev32 v0.8b, v0.8b
-; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
-; CHECK-BE-NEXT:    add sp, sp, #16
+; CHECK-BE-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    ret
   %2 = bitcast i32 %0 to <4 x i8>
   ret <4 x i8> %2
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
--- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -514,3 +514,66 @@
   ret <16 x i8> %val
 }
 
+define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) {
+; CHECK-LE-LABEL: bitcast_i32_to_v2i16:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    fmov s0, w0
+; CHECK-LE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: bitcast_i32_to_v2i16:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov s0, w0
+; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    ret
+  %ret = bitcast i32 %word to <2 x i16>
+  ret <2 x i16> %ret
+}
+
+define <4 x i8> @bitcast_i32_to_v4i8(i32 %word) {
+; CHECK-LE-LABEL: bitcast_i32_to_v4i8:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    fmov s0, w0
+; CHECK-LE-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: bitcast_i32_to_v4i8:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov s0, w0
+; CHECK-BE-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-BE-NEXT:    ret
+  %ret = bitcast i32 %word to <4 x i8>
+  ret <4 x i8> %ret
+}
+
+; TODO: Eliminate redundant moving back and forth between gpr and vectors
+define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
+; CHECK-LE-LABEL: bitcast_i16_to_v2i8:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    fmov s0, w0
+; CHECK-LE-NEXT:    umov w8, v0.b[0]
+; CHECK-LE-NEXT:    umov w9, v0.b[1]
+; CHECK-LE-NEXT:    fmov s0, w8
+; CHECK-LE-NEXT:    mov v0.s[1], w9
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: bitcast_i16_to_v2i8:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov s0, w0
+; CHECK-BE-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-BE-NEXT:    umov w8, v0.b[0]
+; CHECK-BE-NEXT:    umov w9, v0.b[1]
+; CHECK-BE-NEXT:    fmov s0, w8
+; CHECK-BE-NEXT:    mov v0.s[1], w9
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    ret
+  %ret = bitcast i16 %word to <2 x i8>
+  ret <2 x i8> %ret
+}
+
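
For reference, a minimal standalone reproducer, outside the FileCheck harness, looks like the sketch below; the llc invocations and the file/function names are assumptions modelled on the usual RUN lines for these tests, not part of the patch.

; repro.ll - exercises the new custom lowering of scalar-to-small-vector bitcasts.
; Assumed commands (not from the patch):
;   llc -mtriple=aarch64-none-linux-gnu    -o - repro.ll   (little-endian)
;   llc -mtriple=aarch64_be-none-linux-gnu -o - repro.ll   (big-endian)
; With the patch applied, the little-endian output should match the CHECK-LE
; lines above (fmov s0, w0 + ushll), with no stack spill/reload.
define <2 x i16> @repro_v2i16(i32 %w) {
  %v = bitcast i32 %w to <2 x i16>
  ret <2 x i16> %v
}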