Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -855,6 +855,7 @@
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1125,6 +1125,13 @@
     setOperationAction(ISD::VSCALE, MVT::i32, Custom);
 
     setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
   }
 
   if (Subtarget->hasSVE()) {
@@ -4472,6 +4479,41 @@
   return SDValue();
 }
 
+// Custom lowering for extending v4i8 vector loads.
+SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
+                                         SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  assert(LoadNode && "Expected custom lowering of a load node");
+  EVT VT = Op->getValueType(0);
+  assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
+
+  if (LoadNode->getMemoryVT() != MVT::v4i8)
+    return SDValue();
+
+  unsigned ExtType;
+  if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
+    ExtType = ISD::SIGN_EXTEND;
+  else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
+           LoadNode->getExtensionType() == ISD::EXTLOAD)
+    ExtType = ISD::ZERO_EXTEND;
+  else
+    return SDValue();
+
+  SDValue Load = DAG.getLoad(MVT::f32, DL, DAG.getEntryNode(),
+                             LoadNode->getBasePtr(),
+                             MachinePointerInfo());
+  SDValue Chain = Load.getValue(1);
+  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
+  SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
+  SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
+  Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
+                    DAG.getConstant(0, DL, MVT::i64));
+  if (VT == MVT::v4i32)
+    Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
+  return DAG.getMergeValues({Ext, Chain}, DL);
+}
+
 // Generate SUBS and CSEL for integer abs.
 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
@@ -4715,7 +4757,7 @@
   case ISD::LOAD:
     if (useSVEForFixedLengthVectorVT(Op.getValueType()))
       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
-    llvm_unreachable("Unexpected request to lower ISD::LOAD");
+    return LowerLOAD(Op, DAG);
   case ISD::ADD:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
   case ISD::AND:
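The lowering above turns an extending v4i8 load into a single 32-bit scalar load (`ldr s0`) whose bytes are moved into a vector register and widened with one `sshll`/`ushll` per extension step, replacing the previous four byte loads plus lane inserts. As a reference, here is a minimal standalone sketch of the same sequence using ACLE NEON intrinsics; it is not part of the patch, and the helper name is illustrative:

```cpp
#include <arm_neon.h>
#include <cstring>

// Sign-extending v4i8 -> v4i16 load, mirroring the DAG built in LowerLOAD:
// load the four bytes as one 32-bit scalar (ldr s0, [x0]), place them in a
// 64-bit vector register, and widen with a single shift (sshll).
int16x4_t sext_v4i8_to_v4i16(const int8_t *p) {
  int32_t w;
  std::memcpy(&w, p, sizeof w);             // ldr s0, [x0]
  int8x8_t bytes = vreinterpret_s8_s32(vdup_n_s32(w));
  return vget_low_s16(vmovl_s8(bytes));     // sshll v0.8h, v0.8b, #0
}
```

On big-endian targets the scalar load leaves the four bytes in reversed lane order, so the expected output in the test updates below inserts a `rev32 v0.8b` before the widening shift.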
Index: llvm/test/CodeGen/AArch64/neon-extload.ll
===================================================================
--- llvm/test/CodeGen/AArch64/neon-extload.ll
+++ llvm/test/CodeGen/AArch64/neon-extload.ll
@@ -5,26 +5,17 @@
 define <4 x i32> @fsext_v4i32(<4 x i8>* %a) {
 ; LE-LABEL: fsext_v4i32:
 ; LE:       // %bb.0:
-; LE-NEXT:    ldrsb w8, [x0]
-; LE-NEXT:    ldrsb w9, [x0, #1]
-; LE-NEXT:    ldrsb w10, [x0, #2]
-; LE-NEXT:    ldrsb w11, [x0, #3]
-; LE-NEXT:    fmov s0, w8
-; LE-NEXT:    mov v0.s[1], w9
-; LE-NEXT:    mov v0.s[2], w10
-; LE-NEXT:    mov v0.s[3], w11
+; LE-NEXT:    ldr s0, [x0]
+; LE-NEXT:    sshll v0.8h, v0.8b, #0
+; LE-NEXT:    sshll v0.4s, v0.4h, #0
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: fsext_v4i32:
 ; BE:       // %bb.0:
-; BE-NEXT:    ldrsb w8, [x0]
-; BE-NEXT:    ldrsb w9, [x0, #1]
-; BE-NEXT:    ldrsb w10, [x0, #2]
-; BE-NEXT:    ldrsb w11, [x0, #3]
-; BE-NEXT:    fmov s0, w8
-; BE-NEXT:    mov v0.s[1], w9
-; BE-NEXT:    mov v0.s[2], w10
-; BE-NEXT:    mov v0.s[3], w11
+; BE-NEXT:    ldr s0, [x0]
+; BE-NEXT:    rev32 v0.8b, v0.8b
+; BE-NEXT:    sshll v0.8h, v0.8b, #0
+; BE-NEXT:    sshll v0.4s, v0.4h, #0
 ; BE-NEXT:    rev64 v0.4s, v0.4s
 ; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; BE-NEXT:    ret
@@ -36,26 +27,17 @@
 define <4 x i32> @fzext_v4i32(<4 x i8>* %a) {
 ; LE-LABEL: fzext_v4i32:
 ; LE:       // %bb.0:
-; LE-NEXT:    ldrb w8, [x0]
-; LE-NEXT:    ldrb w9, [x0, #1]
-; LE-NEXT:    ldrb w10, [x0, #2]
-; LE-NEXT:    ldrb w11, [x0, #3]
-; LE-NEXT:    fmov s0, w8
-; LE-NEXT:    mov v0.s[1], w9
-; LE-NEXT:    mov v0.s[2], w10
-; LE-NEXT:    mov v0.s[3], w11
+; LE-NEXT:    ldr s0, [x0]
+; LE-NEXT:    ushll v0.8h, v0.8b, #0
+; LE-NEXT:    ushll v0.4s, v0.4h, #0
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: fzext_v4i32:
 ; BE:       // %bb.0:
-; BE-NEXT:    ldrb w8, [x0]
-; BE-NEXT:    ldrb w9, [x0, #1]
-; BE-NEXT:    ldrb w10, [x0, #2]
-; BE-NEXT:    ldrb w11, [x0, #3]
-; BE-NEXT:    fmov s0, w8
-; BE-NEXT:    mov v0.s[1], w9
-; BE-NEXT:    mov v0.s[2], w10
-; BE-NEXT:    mov v0.s[3], w11
+; BE-NEXT:    ldr s0, [x0]
+; BE-NEXT:    rev32 v0.8b, v0.8b
+; BE-NEXT:    ushll v0.8h, v0.8b, #0
+; BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; BE-NEXT:    rev64 v0.4s, v0.4s
 ; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; BE-NEXT:    ret
@@ -65,16 +47,21 @@
 }
 
 define i32 @loadExt.i32(<4 x i8>* %ref) {
-; CHECK-LABEL: loadExt.i32:
-; CHECK: ldrb
 ; LE-LABEL: loadExt.i32:
 ; LE:       // %bb.0:
-; LE-NEXT:    ldrb w0, [x0]
+; LE-NEXT:    ldr s0, [x0]
+; LE-NEXT:    ushll v0.8h, v0.8b, #0
+; LE-NEXT:    umov w8, v0.h[0]
+; LE-NEXT:    and w0, w8, #0xff
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: loadExt.i32:
 ; BE:       // %bb.0:
-; BE-NEXT:    ldrb w0, [x0]
+; BE-NEXT:    ldr s0, [x0]
+; BE-NEXT:    rev32 v0.8b, v0.8b
+; BE-NEXT:    ushll v0.8h, v0.8b, #0
+; BE-NEXT:    umov w8, v0.h[0]
+; BE-NEXT:    and w0, w8, #0xff
 ; BE-NEXT:    ret
   %a = load <4 x i8>, <4 x i8>* %ref
   %vecext = extractelement <4 x i8> %a, i32 0
@@ -85,27 +72,16 @@
 define <4 x i16> @fsext_v4i16(<4 x i8>* %a) {
 ; LE-LABEL: fsext_v4i16:
 ; LE:       // %bb.0:
-; LE-NEXT:    ldrsb w8, [x0]
-; LE-NEXT:    ldrsb w9, [x0, #1]
-; LE-NEXT:    ldrsb w10, [x0, #2]
-; LE-NEXT:    ldrsb w11, [x0, #3]
-; LE-NEXT:    fmov s0, w8
-; LE-NEXT:    mov v0.h[1], w9
-; LE-NEXT:    mov v0.h[2], w10
-; LE-NEXT:    mov v0.h[3], w11
+; LE-NEXT:    ldr s0, [x0]
+; LE-NEXT:    sshll v0.8h, v0.8b, #0
 ; LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: fsext_v4i16:
 ; BE:       // %bb.0:
-; BE-NEXT:    ldrsb w8, [x0]
-; BE-NEXT:    ldrsb w9, [x0, #1]
-; BE-NEXT:    ldrsb w10, [x0, #2]
-; BE-NEXT:    ldrsb w11, [x0, #3]
-; BE-NEXT:    fmov s0, w8
-; BE-NEXT:    mov v0.h[1], w9
-; BE-NEXT:    mov v0.h[2], w10
-; BE-NEXT:    mov v0.h[3], w11
+; BE-NEXT:    ldr s0, [x0]
+; BE-NEXT:    rev32 v0.8b, v0.8b
+; BE-NEXT:    sshll v0.8h, v0.8b, #0
 ; BE-NEXT:    rev64 v0.4h, v0.4h
 ; BE-NEXT:    ret
   %x = load <4 x i8>, <4 x i8>* %a
@@ -116,30 +92,86 @@
 define <4 x i16> @fzext_v4i16(<4 x i8>* %a) {
 ; LE-LABEL: fzext_v4i16:
 ; LE:       // %bb.0:
-; LE-NEXT:    ldrb w8, [x0]
-; LE-NEXT:    ldrb w9, [x0, #1]
-; LE-NEXT:    ldrb w10, [x0, #2]
-; LE-NEXT:    ldrb w11, [x0, #3]
-; LE-NEXT:    fmov s0, w8
-; LE-NEXT:    mov v0.h[1], w9
-; LE-NEXT:    mov v0.h[2], w10
-; LE-NEXT:    mov v0.h[3], w11
+; LE-NEXT:    ldr s0, [x0]
+; LE-NEXT:    ushll v0.8h, v0.8b, #0
 ; LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: fzext_v4i16:
 ; BE:       // %bb.0:
-; BE-NEXT:    ldrb w8, [x0]
-; BE-NEXT:    ldrb w9, [x0, #1]
-; BE-NEXT:    ldrb w10, [x0, #2]
-; BE-NEXT:    ldrb w11, [x0, #3]
-; BE-NEXT:    fmov s0, w8
-; BE-NEXT:    mov v0.h[1], w9
-; BE-NEXT:    mov v0.h[2], w10
-; BE-NEXT:    mov v0.h[3], w11
+; BE-NEXT:    ldr s0, [x0]
+; BE-NEXT:    rev32 v0.8b, v0.8b
+; BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; BE-NEXT:    rev64 v0.4h, v0.4h
 ; BE-NEXT:    ret
   %x = load <4 x i8>, <4 x i8>* %a
   %y = zext <4 x i8> %x to <4 x i16>
   ret <4 x i16> %y
 }
+
+define <4 x i16> @anyext_v4i16(<4 x i8> *%a, <4 x i8> *%b) {
+; LE-LABEL: anyext_v4i16:
+; LE:       // %bb.0:
+; LE-NEXT:    ldr s0, [x0]
+; LE-NEXT:    ldr s1, [x1]
+; LE-NEXT:    ushll v0.8h, v0.8b, #0
+; LE-NEXT:    ushll v1.8h, v1.8b, #0
+; LE-NEXT:    add v0.4h, v0.4h, v1.4h
+; LE-NEXT:    shl v0.4h, v0.4h, #8
+; LE-NEXT:    sshr v0.4h, v0.4h, #8
+; LE-NEXT:    ret
+;
+; BE-LABEL: anyext_v4i16:
+; BE:       // %bb.0:
+; BE-NEXT:    ldr s0, [x0]
+; BE-NEXT:    ldr s1, [x1]
+; BE-NEXT:    rev32 v0.8b, v0.8b
+; BE-NEXT:    rev32 v1.8b, v1.8b
+; BE-NEXT:    ushll v0.8h, v0.8b, #0
+; BE-NEXT:    ushll v1.8h, v1.8b, #0
+; BE-NEXT:    add v0.4h, v0.4h, v1.4h
+; BE-NEXT:    shl v0.4h, v0.4h, #8
+; BE-NEXT:    sshr v0.4h, v0.4h, #8
+; BE-NEXT:    rev64 v0.4h, v0.4h
+; BE-NEXT:    ret
+  %x = load <4 x i8>, <4 x i8>* %a, align 4
+  %y = load <4 x i8>, <4 x i8>* %b, align 4
+  %z = add <4 x i8> %x, %y
+  %s = sext <4 x i8> %z to <4 x i16>
+  ret <4 x i16> %s
+}
+
+define <4 x i32> @anyext_v4i32(<4 x i8> *%a, <4 x i8> *%b) {
+; LE-LABEL: anyext_v4i32:
+; LE:       // %bb.0:
+; LE-NEXT:    ldr s0, [x0]
+; LE-NEXT:    ldr s1, [x1]
+; LE-NEXT:    ushll v0.8h, v0.8b, #0
+; LE-NEXT:    ushll v1.8h, v1.8b, #0
+; LE-NEXT:    add v0.4h, v0.4h, v1.4h
+; LE-NEXT:    ushll v0.4s, v0.4h, #0
+; LE-NEXT:    shl v0.4s, v0.4s, #24
+; LE-NEXT:    sshr v0.4s, v0.4s, #24
+; LE-NEXT:    ret
+;
+; BE-LABEL: anyext_v4i32:
+; BE:       // %bb.0:
+; BE-NEXT:    ldr s0, [x0]
+; BE-NEXT:    ldr s1, [x1]
+; BE-NEXT:    rev32 v0.8b, v0.8b
+; BE-NEXT:    rev32 v1.8b, v1.8b
+; BE-NEXT:    ushll v0.8h, v0.8b, #0
+; BE-NEXT:    ushll v1.8h, v1.8b, #0
+; BE-NEXT:    add v0.4h, v0.4h, v1.4h
+; BE-NEXT:    ushll v0.4s, v0.4h, #0
+; BE-NEXT:    shl v0.4s, v0.4s, #24
+; BE-NEXT:    sshr v0.4s, v0.4s, #24
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    ret
+  %x = load <4 x i8>, <4 x i8>* %a, align 4
+  %y = load <4 x i8>, <4 x i8>* %b, align 4
+  %z = add <4 x i8> %x, %y
+  %s = sext <4 x i8> %z to <4 x i32>
+  ret <4 x i32> %s
+}
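The v4i32 cases above chain a second widening shift onto the same sequence. For completeness, an intrinsics sketch of that variant (again illustrative only, not part of the patch), matching the `ldr` + `sshll` + `sshll` checked by `fsext_v4i32`:

```cpp
#include <arm_neon.h>
#include <cstring>

// Sign-extending v4i8 -> v4i32 load: the v4i16 sequence plus one more
// widening shift.
int32x4_t sext_v4i8_to_v4i32(const int8_t *p) {
  int32_t w;
  std::memcpy(&w, p, sizeof w);             // ldr s0, [x0]
  int8x8_t bytes = vreinterpret_s8_s32(vdup_n_s32(w));
  int16x8_t halves = vmovl_s8(bytes);       // sshll v0.8h, v0.8b, #0
  return vmovl_s16(vget_low_s16(halves));   // sshll v0.4s, v0.4h, #0
}
```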