Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1393,8 +1393,8 @@
   /// operations except for the pointer size. If AllowUnknown is true, this
   /// will return MVT::Other for types with no EVT counterpart (e.g. structs),
   /// otherwise it will assert.
-  EVT getValueType(const DataLayout &DL, Type *Ty,
-                   bool AllowUnknown = false) const {
+  virtual EVT getValueType(const DataLayout &DL, Type *Ty,
+                           bool AllowUnknown = false) const {
     // Lower scalar pointers to native pointer types.
     if (auto *PTy = dyn_cast<PointerType>(Ty))
       return getPointerTy(DL, PTy->getAddressSpace());
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8736,13 +8736,21 @@
               OpInfo.ConstraintType == TargetLowering::C_Register) &&
              "Unknown constraint type!");

-      // TODO: Support this.
+      SDLoc dl = getCurSDLoc();
+
       if (OpInfo.isIndirect) {
-        emitInlineAsmError(
-            Call, "Don't know how to handle indirect register inputs yet "
-                  "for constraint '" +
-                      Twine(OpInfo.ConstraintCode) + "'");
-        return;
+        // For the time being we only know how to deal with MVT::i64x8.
+        if (OpInfo.ConstraintVT == MVT::i64x8) {
+          InOperandVal = DAG.getLoad(OpInfo.ConstraintVT, dl, Chain,
+                                     InOperandVal, MachinePointerInfo());
+        } else {
+          // TODO: Support this.
+          emitInlineAsmError(
+              Call, "Don't know how to handle indirect register inputs yet "
+                    "for constraint '" +
+                        Twine(OpInfo.ConstraintCode) + "'");
+          return;
+        }
       }

       // Copy the input into the appropriate registers.
@@ -8756,8 +8764,6 @@
       if (DetectWriteToReservedRegister())
         return;

-      SDLoc dl = getCurSDLoc();
-
       OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
                                         &Call);
Index: llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -656,6 +656,9 @@
   case 'x':
     Reg = getXRegFromWReg(Reg);
     break;
+  case 't':
+    Reg = getXRegFromXRegTuple(Reg);
+    break;
   }

   O << AArch64InstPrinter::getRegisterName(Reg);
@@ -752,6 +755,10 @@
         AArch64::GPR64allRegClass.contains(Reg))
       return printAsmMRegister(MO, 'x', O);

+    // If this is an x register tuple, print an x register.
+    if (AArch64::GPR64x8ClassRegClass.contains(Reg))
+      return printAsmMRegister(MO, 't', O);
+
     unsigned AltName = AArch64::NoRegAltName;
     const TargetRegisterClass *RegClass;
     if (AArch64::ZPRRegClass.contains(Reg)) {
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -330,6 +330,10 @@
   // Cast between vectors of the same element type but differ in length.
   REINTERPRET_CAST,

+  // Nodes to build an LD64B / ST64B 64-byte quantity out of i64s, and vice versa
+  LS64_BUILD,
+  LS64_EXTRACT,
+
   LD1_MERGE_ZERO,
   LD1S_MERGE_ZERO,
   LDNF1_MERGE_ZERO,
@@ -823,6 +827,9 @@

   bool isAllActivePredicate(SDValue N) const;

+  EVT getValueType(const DataLayout &DL, Type *Ty,
+                   bool AllowUnknown = false) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -246,6 +246,14 @@
   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

+  if (Subtarget->hasLS64()) {
+    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
+    setOperationAction(ISD::BITCAST, MVT::i64x8, Legal);
+    setOperationAction(ISD::UNDEF, MVT::i64x8, Legal);
+    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
+    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
+  }
+
   if (Subtarget->hasFPARMv8()) {
     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
     addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
@@ -1991,6 +1999,8 @@
     MAKE_CASE(AArch64ISD::LASTA)
     MAKE_CASE(AArch64ISD::LASTB)
     MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
+    MAKE_CASE(AArch64ISD::LS64_BUILD)
+    MAKE_CASE(AArch64ISD::LS64_EXTRACT)
     MAKE_CASE(AArch64ISD::TBL)
     MAKE_CASE(AArch64ISD::FADD_PRED)
     MAKE_CASE(AArch64ISD::FADDA_PRED)
@@ -4556,17 +4566,52 @@
         {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
         StoreNode->getMemoryVT(), StoreNode->getMemOperand());
     return Result;
+  } else if (MemVT == MVT::i64x8) {
+    SDValue Value = StoreNode->getValue();
+    assert(Value->getValueType(0) == MVT::i64x8);
+    SDValue Chain = StoreNode->getChain();
+    SDValue Base = StoreNode->getBasePtr();
+    EVT PtrVT = Base.getValueType();
+    for (unsigned i = 0; i < 8; i++) {
+      SDValue Part =
+          DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
+                      Value, DAG.getConstant(i, Dl, MVT::i32));
+      SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
+                                DAG.getConstant(i * 8, Dl, PtrVT));
+      Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
+                           StoreNode->getOriginalAlign());
+    }
+    return Chain;
   }

   return SDValue();
 }

-// Custom lowering for extending v4i8 vector loads.
 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
                                          SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   assert(LoadNode && "Expected custom lowering of a load node");
+
+  if (LoadNode->getMemoryVT() == MVT::i64x8) {
+    SmallVector<SDValue, 8> Ops;
+    SDValue Base = LoadNode->getBasePtr();
+    SDValue Chain = LoadNode->getChain();
+    EVT PtrVT = Base.getValueType();
+    for (unsigned i = 0; i < 8; i++) {
+      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
+                                DAG.getConstant(i * 8, DL, PtrVT));
+      SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
+                                 LoadNode->getPointerInfo(),
+                                 LoadNode->getOriginalAlign());
+      Ops.push_back(Part);
+      Chain = SDValue(Part.getNode(), 1);
+    }
+    SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
+    return DAG.getMergeValues({Loaded, Chain}, DL);
+  }
+
+  // Custom lowering for extending v4i8 vector loads.
   EVT VT = Op->getValueType(0);
   assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
@@ -8112,6 +8157,8 @@
     case 'r':
       if (VT.isScalableVector())
         return std::make_pair(0U, nullptr);
+      if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
+        return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
       if (VT.getFixedSizeInBits() == 64)
         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
@@ -8199,6 +8246,16 @@
   return Res;
 }

+EVT AArch64TargetLowering::getValueType(const DataLayout &DL, llvm::Type *Ty,
+                                        bool AllowUnknown) const {
+  if (Subtarget->hasLS64())
+    if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+      if (ATy->getNumElements() == 8 && ATy->getElementType()->isIntegerTy(64))
+        return EVT(MVT::i64x8);
+
+  return TargetLowering::getValueType(DL, Ty, AllowUnknown);
+}
+
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector. If it is invalid, don't add anything to Ops.
 void AArch64TargetLowering::LowerAsmOperandForConstraint(
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8054,6 +8054,20 @@
   // FIXME: add SVE dot-product patterns.
 }

+// Custom DAG nodes and isel rules to make a 64-byte block out of eight GPRs,
+// so that it can be used as input to inline asm, and vice versa.
+def LS64_BUILD : SDNode<"AArch64ISD::LS64_BUILD", SDTypeProfile<1, 8, []>>;
+def LS64_EXTRACT : SDNode<"AArch64ISD::LS64_EXTRACT", SDTypeProfile<1, 2, []>>;
+def : Pat<(i64x8 (LS64_BUILD GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3,
+                             GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7)),
+          (REG_SEQUENCE GPR64x8Class,
+              $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3,
+              $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7)>;
+foreach i = 0-7 in {
+  def : Pat<(i64 (LS64_EXTRACT (i64x8 GPR64x8:$val), (i32 i))),
+            (EXTRACT_SUBREG $val, !cast<SubRegIndex>("x8sub_"#i))>;
+}
+
 let Predicates = [HasLS64] in {
   def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn),
                                           (outs GPR64x8:$Rt)>;
Index: llvm/lib/Target/AArch64/AArch64RegisterInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -724,7 +724,9 @@
   !foreach(i, [0,1,2,3,4,5,6,7], !cast<SubRegIndex>("x8sub_"#i)),
   !foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>;

-def GPR64x8Class : RegisterClass<"AArch64", [i64], 64, (trunc Tuples8X, 12)>;
+def GPR64x8Class : RegisterClass<"AArch64", [i64x8], 512, (trunc Tuples8X, 12)> {
+  let Size = 512;
+}
 def GPR64x8AsmOp : AsmOperandClass {
   let Name = "GPR64x8";
   let ParserMethod = "tryParseGPR64x8";
Index: llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
===================================================================
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -106,6 +106,25 @@
   return Reg;
 }

+inline static unsigned getXRegFromXRegTuple(unsigned RegTuple) {
+  switch (RegTuple) {
+  case AArch64::X0_X1_X2_X3_X4_X5_X6_X7: return AArch64::X0;
+  case AArch64::X2_X3_X4_X5_X6_X7_X8_X9: return AArch64::X2;
+  case AArch64::X4_X5_X6_X7_X8_X9_X10_X11: return AArch64::X4;
+  case AArch64::X6_X7_X8_X9_X10_X11_X12_X13: return AArch64::X6;
+  case AArch64::X8_X9_X10_X11_X12_X13_X14_X15: return AArch64::X8;
+  case AArch64::X10_X11_X12_X13_X14_X15_X16_X17: return AArch64::X10;
+  case AArch64::X12_X13_X14_X15_X16_X17_X18_X19: return AArch64::X12;
+  case AArch64::X14_X15_X16_X17_X18_X19_X20_X21: return AArch64::X14;
+  case AArch64::X16_X17_X18_X19_X20_X21_X22_X23: return AArch64::X16;
+  case AArch64::X18_X19_X20_X21_X22_X23_X24_X25: return AArch64::X18;
+  case AArch64::X20_X21_X22_X23_X24_X25_X26_X27: return AArch64::X20;
+  case AArch64::X22_X23_X24_X25_X26_X27_X28_FP: return AArch64::X22;
+  }
+  // For anything else, return it unchanged.
+  return RegTuple;
+}
+
 static inline unsigned getBRegFromDReg(unsigned Reg) {
   switch (Reg) {
   case AArch64::D0: return AArch64::B0;
Index: llvm/test/CodeGen/AArch64/ls64-inline-asm.ll
===================================================================
--- llvm/test/CodeGen/AArch64/ls64-inline-asm.ll
+++ llvm/test/CodeGen/AArch64/ls64-inline-asm.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+ls64 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be -mattr=+ls64 -verify-machineinstrs -o - %s | FileCheck %s
+
+%struct.foo = type { [8 x i64] }
+
+define void @load(%struct.foo* %output, i8* %addr) {
+; CHECK-LABEL: load:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    ld64b x2, [x1]
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    stp x8, x9, [x0, #48]
+; CHECK-NEXT:    stp x6, x7, [x0, #32]
+; CHECK-NEXT:    stp x4, x5, [x0, #16]
+; CHECK-NEXT:    stp x2, x3, [x0]
+; CHECK-NEXT:    ret
+entry:
+  tail call void asm sideeffect "ld64b $0,[$1]", "=*r,r,~{memory}"(%struct.foo* %output, i8* %addr)
+  ret void
+}
+
+define void @store(%struct.foo* %input, i8* %addr) {
+; CHECK-LABEL: store:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp x8, x9, [x0, #48]
+; CHECK-NEXT:    ldp x6, x7, [x0, #32]
+; CHECK-NEXT:    ldp x4, x5, [x0, #16]
+; CHECK-NEXT:    ldp x2, x3, [x0]
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    st64b x2, [x1]
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+entry:
+  tail call void asm sideeffect "st64b $0,[$1]", "*r,r,~{memory}"(%struct.foo* %input, i8* %addr)
+  ret void
+}
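
For reference, a minimal C sketch of how this lowering might be driven from source-level inline asm. This is not part of the patch: it assumes a frontend (e.g. a Clang with the companion LS64 change) that accepts a 64-byte struct operand for the "r" constraint and lowers it to the indirect "=*r"/"*r" operands exercised by the IR test above; the struct and function names below are invented for the example.

/*
 * Illustrative sketch only.  With -march=armv8.7-a+ls64, a struct of eight
 * 64-bit integers bound to an "r" constraint is expected to be allocated to
 * an even-numbered x-register tuple, printed via the new 't' modifier path.
 */
#include <stdint.h>

struct foo {
  uint64_t val[8]; /* mirrors %struct.foo = type { [8 x i64] } in the test */
};

/* LD64B: load 64 bytes from addr into an x-register tuple, then spill to *output. */
static inline void load64b(struct foo *output, void *addr) {
  __asm__ volatile("ld64b %0, [%1]" : "=r"(*output) : "r"(addr) : "memory");
}

/* ST64B: gather *input into an x-register tuple and store 64 bytes to addr. */
static inline void store64b(const struct foo *input, void *addr) {
  __asm__ volatile("st64b %0, [%1]" : : "r"(*input), "r"(addr) : "memory");
}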