Index: llvm/include/llvm/CodeGen/MachineMemOperand.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -150,7 +150,10 @@
     MOTargetFlag2 = 1u << 7,
     MOTargetFlag3 = 1u << 8,
 
-    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MOTargetFlag3)
+    // The memory access is non-faulting
+    MONonFaulting = 1u << 9,
+
+    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MONonFaulting)
   };
 
 private:
@@ -261,6 +264,7 @@
   bool isNonTemporal() const { return FlagVals & MONonTemporal; }
   bool isDereferenceable() const { return FlagVals & MODereferenceable; }
   bool isInvariant() const { return FlagVals & MOInvariant; }
+  bool isNonFaulting() const { return FlagVals & MONonFaulting; }
 
   /// Returns true if this operation has an atomic ordering requirement of
   /// unordered or higher, false otherwise.
Index: llvm/include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -548,8 +548,9 @@
     uint16_t IsNonTemporal : 1;
     uint16_t IsDereferenceable : 1;
     uint16_t IsInvariant : 1;
+    uint16_t IsNonFaulting : 1;
   };
-  enum { NumMemSDNodeBits = NumSDNodeBits + 4 };
+  enum { NumMemSDNodeBits = NumSDNodeBits + 5 };
 
   class LSBaseSDNodeBitfields {
     friend class LSBaseSDNode;
@@ -1321,6 +1322,7 @@
   bool isNonTemporal() const { return MemSDNodeBits.IsNonTemporal; }
   bool isDereferenceable() const { return MemSDNodeBits.IsDereferenceable; }
   bool isInvariant() const { return MemSDNodeBits.IsInvariant; }
+  bool isNonFaulting() const { return MemSDNodeBits.IsNonFaulting; }
 
   // Returns the offset from the location of the access.
   int64_t getSrcValueOffset() const { return MMO->getOffset(); }
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -775,6 +775,12 @@
                 LLVMPointerTo<0>],
                [IntrReadMem, IntrArgMemOnly]>;
 
+  class AdvSIMD_1Vec_PredFaultingLoad_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                 LLVMPointerToElt<0>],
+                [IntrReadMem, IntrArgMemOnly]>;
+
   class AdvSIMD_1Vec_PredStore_Intrinsic
     : Intrinsic<[],
                 [llvm_anyvector_ty,
@@ -1070,6 +1076,8 @@
 
 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
+def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
+
 //
 // Stores
 //
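For context, here is a minimal sketch (not part of this patch) of how the new intrinsic is expected to be reached from C, assuming the usual ACLE mapping of svldnf1 to llvm.aarch64.sve.ldnf1; the FFR helpers used below (svsetffr, svrdffr) are separate intrinsics and are not touched by this change:

```cpp
// Hypothetical ACLE usage sketch; needs an SVE-enabled toolchain
// (e.g. -march=armv8-a+sve) and is valid as either C or C++.
#include <arm_sve.h>
#include <stdint.h>

// Speculatively load one vector of int32 elements. Lanes that cannot be
// loaded (for example because they fall in an unmapped page) do not fault;
// the first-fault register (FFR) records which leading lanes are valid.
svint32_t load_nonfaulting(const int32_t *base, svbool_t *valid) {
  svsetffr();                              // reset the FFR to all-true
  svbool_t pg = svptrue_b32();
  svint32_t data = svldnf1_s32(pg, base);  // non-faulting load (LDNF1W)
  *valid = svrdffr();                      // leading lanes actually loaded
  return data;
}
```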
Index: llvm/lib/CodeGen/MachineOperand.cpp
===================================================================
--- llvm/lib/CodeGen/MachineOperand.cpp
+++ llvm/lib/CodeGen/MachineOperand.cpp
@@ -1089,6 +1089,8 @@
   if (getFlags() & MachineMemOperand::MOTargetFlag3)
     OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag3)
        << "\" ";
+  if (isNonFaulting())
+    OS << "non-faulting ";
 
   assert((isLoad() || isStore()) &&
          "machine memory operand must be a load or store (or both)");
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4479,6 +4479,22 @@
     }
   }
 
+  if (Operand.getOpcode() == ISD::SPLAT_VECTOR) {
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand.getOperand(0))) {
+      const APInt &Val = C->getAPIntValue();
+      switch (Opcode) {
+      default: break;
+      case ISD::ANY_EXTEND:
+      case ISD::ZERO_EXTEND:
+        return getConstant(Val.zextOrTrunc(VT.getScalarSizeInBits()), DL, VT,
+                           C->isTargetOpcode(), C->isOpaque());
+      case ISD::SIGN_EXTEND:
+        return getConstant(Val.sextOrTrunc(VT.getScalarSizeInBits()), DL, VT,
+                           C->isTargetOpcode(), C->isOpaque());
+      }
+    }
+  }
+
   unsigned OpOpcode = Operand.getNode()->getOpcode();
   switch (Opcode) {
   case ISD::TokenFactor:
@@ -8914,6 +8930,7 @@
   MemSDNodeBits.IsNonTemporal = MMO->isNonTemporal();
   MemSDNodeBits.IsDereferenceable = MMO->isDereferenceable();
   MemSDNodeBits.IsInvariant = MMO->isInvariant();
+  MemSDNodeBits.IsNonFaulting = MMO->isNonFaulting();
 
   // We check here that the size of the memory operand fits within the size of
   // the MMO. This is because the MMO might indicate only a possible address
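As an aside, a minimal sketch (not from the patch) of what the new SPLAT_VECTOR handling in getNode() buys, assuming DAG and DL come from some surrounding lowering or combine code; extending a constant splat now folds directly into a splat of the extended constant instead of leaving an explicit extend node:

```cpp
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Illustrative only; in-tree callers reach this fold through getNode()
// during legalization or DAG combining.
static SDValue extendZeroSplat(SelectionDAG &DAG, const SDLoc &DL) {
  // splat_vector of the i32 constant 0 with type <vscale x 4 x i16>.
  SDValue Narrow = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv4i16,
                               DAG.getConstant(0, DL, MVT::i32));
  // With the fold above this returns a <vscale x 4 x i32> zero splat
  // directly, rather than a ZERO_EXTEND wrapping the narrow splat.
  return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::nxv4i32, Narrow);
}
```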
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8525,6 +8525,16 @@
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
     return true;
   }
+  case Intrinsic::aarch64_sve_ldnf1: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonFaulting;
+    return true;
+  }
   case Intrinsic::aarch64_sve_stnt1: {
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -12283,6 +12293,7 @@
   case Intrinsic::aarch64_neon_st4lane:
     return performNEONPostLDSTCombine(N, DCI, DAG);
   case Intrinsic::aarch64_sve_ldnt1:
+  case Intrinsic::aarch64_sve_ldnf1:
     return performLDNT1Combine(N, DAG);
   case Intrinsic::aarch64_sve_stnt1:
     return performSTNT1Combine(N, DAG);
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -266,7 +266,8 @@
   (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
   return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
          cast<MaskedLoadSDNode>(N)->isUnindexed() &&
-         !cast<MaskedLoadSDNode>(N)->isNonTemporal();
+         !cast<MaskedLoadSDNode>(N)->isNonTemporal() &&
+         !cast<MaskedLoadSDNode>(N)->isNonFaulting();
 }]>;
 // sign extending masked load fragments.
 def asext_masked_load :
@@ -274,7 +275,8 @@
           (masked_ld node:$ptr, undef, node:$pred, node:$def),[{
   return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD ||
           cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) &&
-         cast<MaskedLoadSDNode>(N)->isUnindexed();
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         !cast<MaskedLoadSDNode>(N)->isNonFaulting();
 }]>;
 def asext_masked_load_i8 :
   PatFrag<(ops node:$ptr, node:$pred, node:$def),
@@ -296,7 +298,8 @@
   PatFrag<(ops node:$ptr, node:$pred, node:$def),
           (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
   return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD &&
-         cast<MaskedLoadSDNode>(N)->isUnindexed();
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         !cast<MaskedLoadSDNode>(N)->isNonFaulting();
 }]>;
 def zext_masked_load_i8 :
   PatFrag<(ops node:$ptr, node:$pred, node:$def),
@@ -319,7 +322,71 @@
   (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
   return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
          cast<MaskedLoadSDNode>(N)->isUnindexed() &&
-         cast<MaskedLoadSDNode>(N)->isNonTemporal();
+         cast<MaskedLoadSDNode>(N)->isNonTemporal() &&
+         !cast<MaskedLoadSDNode>(N)->isNonFaulting();
+}]>;
+
+def non_faulting_load :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         !cast<MaskedLoadSDNode>(N)->isNonTemporal() &&
+         cast<MaskedLoadSDNode>(N)->isNonFaulting();
+}]>;
+
+def sext_non_faulting_load :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         !cast<MaskedLoadSDNode>(N)->isNonTemporal() &&
+         cast<MaskedLoadSDNode>(N)->isNonFaulting();
+}]>;
+
+def sext_non_faulting_load_i8 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (sext_non_faulting_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def sext_non_faulting_load_i16 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (sext_non_faulting_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def sext_non_faulting_load_i32 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (sext_non_faulting_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def zext_non_faulting_load :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         !cast<MaskedLoadSDNode>(N)->isNonTemporal() &&
+         cast<MaskedLoadSDNode>(N)->isNonFaulting();
+}]>;
+
+def zext_non_faulting_load_i8 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (zext_non_faulting_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def zext_non_faulting_load_i16 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (zext_non_faulting_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def zext_non_faulting_load_i32 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (zext_non_faulting_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
 
 // non-truncating masked store fragment.
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1188,6 +1188,30 @@
   defm : pred_store;
   defm : pred_store;
   defm : pred_store;
+
+  // 2-element contiguous non-faulting loads
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+
+  // 4-element contiguous non-faulting loads
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+
+  // 8-element contiguous non-faulting loads
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+
+  // 16-element contiguous non-faulting loads
+  defm : pred_load;
 }
 
 let Predicates = [HasSVE2] in {
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5211,14 +5211,21 @@
 multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
                                RegisterOperand listty, ZPRRegOp zprty> {
-  def "" : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+  def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
-                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
-                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
-                  (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+                  (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+
+  // We need a layer of indirection because early machine code passes balk at
+  // physical register (i.e. FFR) uses that have no previous definition.
+  let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
+  def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
+           PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
+  }
 }
 
 multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
@@ -0,0 +1,182 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @ldnf1b(<vscale x 16 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b:
+; CHECK: ldnf1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1> %pg, i8* %a)
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @ldnf1b_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_h:
+; CHECK: ldnf1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+  %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @ldnf1sb_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_h:
+; CHECK: ldnf1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @ldnf1h(<vscale x 8 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1> %pg, i16* %a)
+  ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 8 x half> @ldnf1h_f16(<vscale x 8 x i1> %pg, half* %a) {
+; CHECK-LABEL: ldnf1h_f16:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1> %pg, half* %a)
+  ret <vscale x 8 x half> %load
+}
+
+define <vscale x 4 x i32> @ldnf1b_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_s:
+; CHECK: ldnf1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1sb_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_s:
+; CHECK: ldnf1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1h_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h_s:
+; CHECK: ldnf1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1sh_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1sh_s:
+; CHECK: ldnf1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1w(<vscale x 4 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1w:
+; CHECK: ldnf1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> %pg, i32* %a)
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @ldnf1w_f32(<vscale x 4 x i1> %pg, float* %a) {
+; CHECK-LABEL: ldnf1w_f32:
+; CHECK: ldnf1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1> %pg, float* %a)
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i64> @ldnf1b_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_d:
+; CHECK: ldnf1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sb_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_d:
+; CHECK: ldnf1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1h_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h_d:
+; CHECK: ldnf1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sh_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1sh_d:
+; CHECK: ldnf1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1w_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1w_d:
+; CHECK: ldnf1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sw_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1sw_d:
+; CHECK: ldnf1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1d(<vscale x 2 x i1> %pg, i64* %a) {
+; CHECK-LABEL: ldnf1d:
+; CHECK: ldnf1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> %pg, i64* %a)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @ldnf1d_f64(<vscale x 2 x i1> %pg, double* %a) {
+; CHECK-LABEL: ldnf1d_f64:
+; CHECK: ldnf1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1> %pg, double* %a)
+  ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1>, i8*)
+
+declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1>, half*)
+
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1>, i8*)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1>, float*)
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1>, i8*)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1>, i16*)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1>, i32*)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1>, double*)