Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -768,6 +768,20 @@
 def llvm_nxv2f64_ty : LLVMType<nxv2f64>;
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
+
+  class AdvSIMD_1Vec_PredLoad_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                 LLVMPointerTo<0>],
+                [IntrReadMem, IntrArgMemOnly]>;
+
+  class AdvSIMD_1Vec_PredStore_Intrinsic
+    : Intrinsic<[],
+                [llvm_anyvector_ty,
+                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                 LLVMPointerTo<0>],
+                [IntrArgMemOnly, NoCapture<2>]>;
+
   class AdvSIMD_Merged1VectorArg_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMMatchType<0>,
@@ -1000,6 +1014,18 @@
                 [IntrReadMem, IntrArgMemOnly]>;
 
 //
+// Loads
+//
+
+def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
+
+//
+// Stores
+//
+
+def int_aarch64_sve_stnt1 : AdvSIMD_1Vec_PredStore_Intrinsic;
+
+//
 // Integer arithmetic
 //
 
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6581,8 +6581,10 @@
   if (Align == 0)  // Ensure that codegen never sees alignment 0
     Align = getEVTAlignment(MemVT);
 
-  if (!Size)
-    Size = MemVT.getStoreSize();
+  if (!Size && !MemVT.isScalableVector())
+    Size = MemVT.getStoreSize();
+  else if (!Size)
+    Size = MemVT.getStoreSize().getKnownMinSize();
 
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO =
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -196,7 +196,10 @@
     UUNPKHI,
     UUNPKLO,
 
+    // SVE specific operations.
     INSR,
+    MLOAD,
+    MSTORE,
 
     // Unsigned gather loads.
     GLD1,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1336,6 +1336,8 @@
   case AArch64ISD::UUNPKHI:         return "AArch64ISD::UUNPKHI";
   case AArch64ISD::UUNPKLO:         return "AArch64ISD::UUNPKLO";
   case AArch64ISD::INSR:            return "AArch64ISD::INSR";
+  case AArch64ISD::MLOAD:           return "AArch64ISD::MLOAD";
+  case AArch64ISD::MSTORE:          return "AArch64ISD::MSTORE";
   case AArch64ISD::GLD1:            return "AArch64ISD::GLD1";
   case AArch64ISD::GLD1_SCALED:     return "AArch64ISD::GLD1_SCALED";
   case AArch64ISD::GLD1_SXTW:       return "AArch64ISD::GLD1_SXTW";
@@ -8494,6 +8496,26 @@
     Info.align = Align(16);
     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
     return true;
+  case Intrinsic::aarch64_sve_ldnt1: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
+    return true;
+  }
+  case Intrinsic::aarch64_sve_stnt1: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
+    return true;
+  }
   default:
     break;
   }
@@ -10716,6 +10738,45 @@
   return NewST1;
 }
 
+static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  EVT LoadVT = VT;
+  if (VT.isFloatingPoint())
+    LoadVT = VT.changeTypeToInteger();
+
+  auto *MINode = cast<MemIntrinsicSDNode>(N);
+  SDValue L = DAG.getMaskedLoad(VT, DL, MINode->getChain(),
+                                MINode->getOperand(3), DAG.getUNDEF(LoadVT),
+                                MINode->getOperand(2), DAG.getUNDEF(LoadVT),
+                                MINode->getMemoryVT(), MINode->getMemOperand(),
+                                ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
+
+  if (VT.isFloatingPoint()) {
+    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  return L;
+}
+
+static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  SDValue Data = N->getOperand(2);
+  EVT DataVT = Data.getValueType();
+
+  if (DataVT.isFloatingPoint())
+    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
+
+  auto *MINode = cast<MemIntrinsicSDNode>(N);
+  return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
+                            DAG.getUNDEF(DataVT), MINode->getOperand(3),
+                            MINode->getMemoryVT(), MINode->getMemOperand(),
+                            ISD::UNINDEXED, false, false);
+}
+
 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
 /// load store optimizer pass will merge them to store pair stores. This should
 /// be better than a movi to create the vector zero followed by a vector store
@@ -11932,6 +11993,10 @@
   case Intrinsic::aarch64_neon_st3lane:
   case Intrinsic::aarch64_neon_st4lane:
     return performNEONPostLDSTCombine(N, DCI, DAG);
+  case Intrinsic::aarch64_sve_ldnt1:
+    return performLDNT1Combine(N, DAG);
+  case Intrinsic::aarch64_sve_stnt1:
+    return performSTNT1Combine(N, DAG);
   case Intrinsic::aarch64_sve_ld1_gather:
     return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1);
   case Intrinsic::aarch64_sve_ld1_gather_index:
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -265,7 +265,8 @@
   PatFrag<(ops node:$ptr, node:$pred, node:$def),
           (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
   return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
-         cast<MaskedLoadSDNode>(N)->isUnindexed();
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         !cast<MaskedLoadSDNode>(N)->isNonTemporal();
 }]>;
 // sign extending masked load fragments.
 def asext_masked_load :
@@ -313,12 +314,21 @@
   return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
 
+def non_temporal_load :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         cast<MaskedLoadSDNode>(N)->isNonTemporal();
+}]>;
+
 // non-truncating masked store fragment.
 def nontrunc_masked_store :
   PatFrag<(ops node:$val, node:$ptr, node:$pred),
           (masked_st node:$val, node:$ptr, undef, node:$pred), [{
   return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
-         cast<MaskedStoreSDNode>(N)->isUnindexed();
+         cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+         !cast<MaskedStoreSDNode>(N)->isNonTemporal();
 }]>;
 // truncating masked store fragments.
 def trunc_masked_store :
@@ -343,6 +353,14 @@
   return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
 
+def non_temporal_store :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+  return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+         cast<MaskedStoreSDNode>(N)->isNonTemporal();
+}]>;
+
 // Node definitions.
 def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
 def AArch64adr  : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1154,6 +1154,18 @@
   // 16-element contiguous stores
   defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>;
+  defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>;
+  defm : pred_load<nxv8i16, nxv8i1,  non_temporal_load, LDNT1H_ZRI>;
+  defm : pred_load<nxv8f16, nxv8i1,  non_temporal_load, LDNT1H_ZRI>;
+  defm : pred_load<nxv4i32, nxv4i1,  non_temporal_load, LDNT1W_ZRI>;
+  defm : pred_load<nxv4f32, nxv4i1,  non_temporal_load, LDNT1W_ZRI>;
+  defm : pred_load<nxv2i64, nxv2i1,  non_temporal_load, LDNT1D_ZRI>;
+  defm : pred_load<nxv2f64, nxv2i1,  non_temporal_load, LDNT1D_ZRI>;
+
+  defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>;
+  defm : pred_store<nxv8i16, nxv8i1,  non_temporal_store, STNT1H_ZRI>;
+  defm : pred_store<nxv4i32, nxv4i1,  non_temporal_store, STNT1W_ZRI>;
+  defm : pred_store<nxv2i64, nxv2i1,  non_temporal_store, STNT1D_ZRI>;
 }
 
 let Predicates = [HasSVE2] in {
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDNT1B
+;
+
+define <vscale x 16 x i8> @ldnt1b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ldnt1b_i8:
+; CHECK: ldnt1b { z0.b }, p0/z, [x0, #0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %pred,
+                                                                 <vscale x 16 x i8>* %addr)
+  ret <vscale x 16 x i8> %res
+}
+
+;
+; LDNT1H
+;
+
+define <vscale x 8 x i16> @ldnt1h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ldnt1h_i16:
+; CHECK: ldnt1h { z0.h }, p0/z, [x0, #0, lsl #1]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %pred,
+                                                                 <vscale x 8 x i16>* %addr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ldnt1h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ldnt1h_f16:
+; CHECK: ldnt1h { z0.h }, p0/z, [x0, #0, lsl #1]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %pred,
+                                                                  <vscale x 8 x half>* %addr)
+  ret <vscale x 8 x half> %res
+}
+
+;
+; LDNT1W
+;
+
+define <vscale x 4 x i32> @ldnt1w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ldnt1w_i32:
+; CHECK: ldnt1w { z0.s }, p0/z, [x0, #0, lsl #2]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %pred,
+                                                                 <vscale x 4 x i32>* %addr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ldnt1w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ldnt1w_f32:
+; CHECK: ldnt1w { z0.s }, p0/z, [x0, #0, lsl #2]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %pred,
+                                                                   <vscale x 4 x float>* %addr)
+  ret <vscale x 4 x float> %res
+}
+
+;
+; LDNT1D
+;
+
+define <vscale x 2 x i64> @ldnt1d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ldnt1d_i64:
+; CHECK: ldnt1d { z0.d }, p0/z, [x0, #0, lsl #3]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %pred,
+                                                                 <vscale x 2 x i64>* %addr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ldnt1d_f64:
+; CHECK: ldnt1d { z0.d }, p0/z, [x0, #0, lsl #3]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %pred,
+                                                                    <vscale x 2 x double>* %addr)
+  ret <vscale x 2 x double> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
@@ -0,0 +1,95 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; STNT1B
+;
+
+define void @stnt1b_i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: stnt1b_i8:
+; CHECK: stnt1b { z0.b }, p0, [x0, #0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %data,
+                                            <vscale x 16 x i1> %pred,
+                                            <vscale x 16 x i8>* %addr)
+  ret void
+}
+
+;
+; STNT1H
+;
+
+define void @stnt1h_i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: stnt1h_i16:
+; CHECK: stnt1h { z0.h }, p0, [x0, #0, lsl #1]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16> %data,
+                                            <vscale x 8 x i1> %pred,
+                                            <vscale x 8 x i16>* %addr)
+  ret void
+}
+
+define void @stnt1h_f16(<vscale x 8 x half> %data, <vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: stnt1h_f16:
+; CHECK: stnt1h { z0.h }, p0, [x0, #0, lsl #1]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half> %data,
+                                            <vscale x 8 x i1> %pred,
+                                            <vscale x 8 x half>* %addr)
+  ret void
+}
+
+;
+; STNT1W
+;
+
+define void @stnt1w_i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: stnt1w_i32:
+; CHECK: stnt1w { z0.s }, p0, [x0, #0, lsl #2]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32> %data,
+                                            <vscale x 4 x i1> %pred,
+                                            <vscale x 4 x i32>* %addr)
+  ret void
+}
+
+define void @stnt1w_f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: stnt1w_f32:
+; CHECK: stnt1w { z0.s }, p0, [x0, #0, lsl #2]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float> %data,
+                                            <vscale x 4 x i1> %pred,
+                                            <vscale x 4 x float>* %addr)
+  ret void
+}
+
+;
+; STNT1D
+;
+
+define void @stnt1d_i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: stnt1d_i64:
+; CHECK: stnt1d { z0.d }, p0, [x0, #0, lsl #3]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
+                                            <vscale x 2 x i1> %pred,
+                                            <vscale x 2 x i64>* %addr)
+  ret void
+}
+
+define void @stnt1d_f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: stnt1d_f64:
+; CHECK: stnt1d { z0.d }, p0, [x0, #0, lsl #3]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double> %data,
+                                            <vscale x 2 x i1> %pred,
+                                            <vscale x 2 x double>* %addr)
+  ret void
+}
+
+declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
+declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
+declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
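Usage note (illustrative sketch, not part of the patch): a front end or IR pass would typically reach the new intrinsics through IRBuilder. The helper name emitNonTemporalCopy below is hypothetical, and the sketch assumes the llvm.aarch64.sve.ldnt1 / llvm.aarch64.sve.stnt1 declarations added by this patch, plus the VectorType::get(ElementType, NumElements, Scalable) overload for scalable vectors; newer trees also need llvm/IR/IntrinsicsAArch64.h for the Intrinsic::aarch64_sve_* IDs.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper: copy one <vscale x 4 x i32> vector from Src to Dst
// under the predicate Pred, using the non-temporal SVE intrinsics.
// Pred must be <vscale x 4 x i1>; Src/Dst must be <vscale x 4 x i32>*.
static void emitNonTemporalCopy(IRBuilder<> &Builder, Module &M, Value *Pred,
                                Value *Src, Value *Dst) {
  // <vscale x 4 x i32>: i32 elements, a minimum of 4 lanes, scalable.
  Type *VecTy = VectorType::get(Builder.getInt32Ty(), 4, /*Scalable=*/true);

  // Both intrinsics are overloaded only on the vector type; the predicate and
  // pointer types are derived from it (see the .td definitions above).
  Function *LdNT1 =
      Intrinsic::getDeclaration(&M, Intrinsic::aarch64_sve_ldnt1, {VecTy});
  Function *StNT1 =
      Intrinsic::getDeclaration(&M, Intrinsic::aarch64_sve_stnt1, {VecTy});

  // %v = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(%pred, %src)
  Value *Data = Builder.CreateCall(LdNT1, {Pred, Src});
  // call void @llvm.aarch64.sve.stnt1.nxv4i32(%v, %pred, %dst)
  Builder.CreateCall(StNT1, {Data, Pred, Dst});
}

Floating-point element types (nxv8f16, nxv4f32, nxv2f64) go through the same intrinsics; the bitcast to and from the integer-typed masked load/store is handled in performLDNT1Combine and performSTNT1Combine above.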