Diff 346689

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 975 Lines • ▼ Show 20 Lines	private:
SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,		SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;		EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;

SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,		SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,		SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
		SDValue LowerFixedLengthVectorMLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for function 'LowerFixedLengthVectorMLoadToSVE' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for function 'LowerFixedLengthVectorMLoadToSVE'…
SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const;		SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const;
SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const;		SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const;
SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,		SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
		SDValue LowerFixedLengthVectorMStoreToSVE(SDValue Op,
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for function 'LowerFixedLengthVectorMStoreToSVE' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for function 'LowerFixedLengthVectorMStoreToSVE'…
		SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op,		SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;
SDValue LowerFixedLengthExtractVectorElt(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFixedLengthExtractVectorElt(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthInsertVectorElt(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFixedLengthInsertVectorElt(SDValue Op, SelectionDAG &DAG) const;

SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,		SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;		SmallVectorImpl<SDNode *> &Created) const override;
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,		SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,103 Lines • ▼ Show 20 Lines if (Subtarget->hasNEON()) {

if (Subtarget->hasSVE()) if (Subtarget->hasSVE())

setOperationAction(ISD::VSCALE, MVT::i32, Custom); setOperationAction(ISD::VSCALE, MVT::i32, Custom);

setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);

} }

if (Subtarget->hasSVE()) { if (Subtarget->hasSVE()) {

// FIXME: Add custom lowering of MLOAD to handle different passthrus (not a

// splat of 0 or undef) once vector selects supported in SVE codegen. See

// D68877 for more details.

for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {

setOperationAction(ISD::BITREVERSE, VT, Custom); setOperationAction(ISD::BITREVERSE, VT, Custom);

setOperationAction(ISD::BSWAP, VT, Custom); setOperationAction(ISD::BSWAP, VT, Custom);

setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom);

setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom);

setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom);

setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);

setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom);

▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {

setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand);

// There are no legal MVT::nxv16f## based types. // There are no legal MVT::nxv16f## based types.

if (VT != MVT::nxv16i1) { if (VT != MVT::nxv16i1) {

setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::SINT_TO_FP, VT, Custom);

setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom);

} }

// NEON doesn't support masked loads or stores, but SVE does

for (auto VT :

sdesmalenUnsubmitted

Not Done

nit: is MVT::v2f16 missing or is this type widened?

sdesmalen: nit: is MVT::v2f16 missing or is this type widened?

DavidTrubyAuthorUnsubmitted

Done

I believe v2f16 isn't a legal type here, so we don't want to do the custom lowering for it. We can just allow it to be handled as before.

DavidTruby: I believe v2f16 isn't a legal type here, so we don't want to do the custom lowering for it. We…

sdesmalenUnsubmitted

Not Done

Fair enough. Would it be good to at least add a test for it?

sdesmalen: Fair enough. Would it be good to at least add a test for it?

DavidTrubyAuthorUnsubmitted

Done

I can do, although it's not really testing anything this patch is doing: the <2 x half> seems to be canonicalised before we come through these code paths so I assume that's already tested elsewhere.
Let me know if you want an added test for completeness though :)

DavidTruby: I can do, although it's not really testing anything this patch is doing: the <2 x half> seems…

sdesmalenUnsubmitted

Not Done

Let me know if you want an added test for completeness though :)

That would be good. I'm aware your patch isn't really doing anything specific for that type and I would suspect it all just works, but to confirm that I would have to download the patch, build it, and try it out, whereas it could also be confirmed (and guarded) by an extra test case.

sdesmalen: > Let me know if you want an added test for completeness though :) That would be good. I'm…

{MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,

MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,

MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {

setOperationAction(ISD::MLOAD, VT, Custom);

setOperationAction(ISD::MSTORE, VT, Custom);

}

} }

for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,

MVT::nxv4f32, MVT::nxv2f64}) { MVT::nxv4f32, MVT::nxv2f64}) {

for (auto InnerVT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, for (auto InnerVT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16,

MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) { MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) {

// Avoid marking truncating FP stores as legal to prevent the // Avoid marking truncating FP stores as legal to prevent the

// DAGCombiner from creating unsupported truncating stores. // DAGCombiner from creating unsupported truncating stores.

▲ Show 20 Lines • Show All 266 Lines • ▼ Show 20 Lines void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {

setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom);

setOperationAction(ISD::FRINT, VT, Custom); setOperationAction(ISD::FRINT, VT, Custom);

setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::FROUND, VT, Custom);

setOperationAction(ISD::FROUNDEVEN, VT, Custom); setOperationAction(ISD::FROUNDEVEN, VT, Custom);

setOperationAction(ISD::FSQRT, VT, Custom); setOperationAction(ISD::FSQRT, VT, Custom);

setOperationAction(ISD::FSUB, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom);

setOperationAction(ISD::FTRUNC, VT, Custom); setOperationAction(ISD::FTRUNC, VT, Custom);

setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::LOAD, VT, Custom);

setOperationAction(ISD::MLOAD, VT, Custom);

setOperationAction(ISD::MSTORE, VT, Custom);

setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MUL, VT, Custom);

setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom);

setOperationAction(ISD::MULHU, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom);

setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::OR, VT, Custom);

setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SDIV, VT, Custom);

setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom);

setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom);

setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom);

▲ Show 20 Lines • Show All 2,986 Lines • ▼ Show 20 Lines case ISD::FRAMEADDR:

return LowerFRAMEADDR(Op, DAG); return LowerFRAMEADDR(Op, DAG);

case ISD::SPONENTRY: case ISD::SPONENTRY:

return LowerSPONENTRY(Op, DAG); return LowerSPONENTRY(Op, DAG);

case ISD::RETURNADDR: case ISD::RETURNADDR:

return LowerRETURNADDR(Op, DAG); return LowerRETURNADDR(Op, DAG);

case ISD::ADDROFRETURNADDR: case ISD::ADDROFRETURNADDR:

return LowerADDROFRETURNADDR(Op, DAG); return LowerADDROFRETURNADDR(Op, DAG);

case ISD::CONCAT_VECTORS: case ISD::CONCAT_VECTORS:

return LowerCONCAT_VECTORS(Op, DAG); return LowerCONCAT_VECTORS(Op, DAG);

peterwaller-armUnsubmitted

Not Done

Nit: Extraneous blank.

peterwaller-arm: Nit: Extraneous blank.

case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT:

return LowerINSERT_VECTOR_ELT(Op, DAG); return LowerINSERT_VECTOR_ELT(Op, DAG);

case ISD::EXTRACT_VECTOR_ELT: case ISD::EXTRACT_VECTOR_ELT:

return LowerEXTRACT_VECTOR_ELT(Op, DAG); return LowerEXTRACT_VECTOR_ELT(Op, DAG);

case ISD::BUILD_VECTOR: case ISD::BUILD_VECTOR:

return LowerBUILD_VECTOR(Op, DAG); return LowerBUILD_VECTOR(Op, DAG);

case ISD::VECTOR_SHUFFLE: case ISD::VECTOR_SHUFFLE:

return LowerVECTOR_SHUFFLE(Op, DAG); return LowerVECTOR_SHUFFLE(Op, DAG);

▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,

/*OverrideNEON=*/true); /*OverrideNEON=*/true);

case ISD::MULHU: case ISD::MULHU:

return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED, return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,

/*OverrideNEON=*/true); /*OverrideNEON=*/true);

case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_WO_CHAIN:

return LowerINTRINSIC_WO_CHAIN(Op, DAG); return LowerINTRINSIC_WO_CHAIN(Op, DAG);

case ISD::STORE: case ISD::STORE:

return LowerSTORE(Op, DAG); return LowerSTORE(Op, DAG);

case ISD::MSTORE:

return LowerFixedLengthVectorMStoreToSVE(Op, DAG);

peterwaller-armUnsubmitted

Done

case ISD::MSTORE:

- if (useSVEForFixedLengthVectorVT(cast<MaskedStoreSDNode>(Op)->getValue().getValueType()), true)

+ if (useSVEForFixedLengthVectorVT(Op->getValueType()->getScalarType()))

return LowerFixedLengthVectorMStoreToSVE(Op, DAG);

Suggestion to avoid the cast.

Also, looks like you're using the "comma true" operator in an if, which I'm guessing is unintended?

peterwaller-arm: Suggestion to avoid the cast. Also, looks like you're using the "comma true" operator in an if…

case ISD::MGATHER: case ISD::MGATHER:

return LowerMGATHER(Op, DAG); return LowerMGATHER(Op, DAG);

case ISD::MSCATTER: case ISD::MSCATTER:

return LowerMSCATTER(Op, DAG); return LowerMSCATTER(Op, DAG);

case ISD::VECREDUCE_SEQ_FADD: case ISD::VECREDUCE_SEQ_FADD:

return LowerVECREDUCE_SEQ_FADD(Op, DAG); return LowerVECREDUCE_SEQ_FADD(Op, DAG);

case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_ADD:

case ISD::VECREDUCE_AND: case ISD::VECREDUCE_AND:

Show All 27 Lines if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&

(ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))

return SDValue(); return SDValue();

return LowerToPredicatedOp(Op, DAG, return LowerToPredicatedOp(Op, DAG,

AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);

} }

case ISD::TRUNCATE: case ISD::TRUNCATE:

return LowerTRUNCATE(Op, DAG); return LowerTRUNCATE(Op, DAG);

case ISD::MLOAD:

return LowerFixedLengthVectorMLoadToSVE(Op, DAG);

peterwaller-armUnsubmitted

Done

Comma true again.

peterwaller-arm: Comma true again.

sdesmalenUnsubmitted

Done

Are SVE masked load/store instructions not always more efficient than scalarising?

sdesmalen: Are SVE masked load/store instructions not always more efficient than scalarising?

DavidTrubyAuthorUnsubmitted

Done

These if statements aren't necessary, as we won't select custom lowering here unless we already have a type that we want to use this lowering for. Consequently I've just removed them.

DavidTruby: These if statements aren't necessary, as we won't select custom lowering here unless we already…

case ISD::LOAD: case ISD::LOAD:

if (useSVEForFixedLengthVectorVT(Op.getValueType())) if (useSVEForFixedLengthVectorVT(Op.getValueType()))

return LowerFixedLengthVectorLoadToSVE(Op, DAG); return LowerFixedLengthVectorLoadToSVE(Op, DAG);

llvm_unreachable("Unexpected request to lower ISD::LOAD"); llvm_unreachable("Unexpected request to lower ISD::LOAD");

case ISD::ADD: case ISD::ADD:

return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);

case ISD::AND: case ISD::AND:

return LowerToScalableOp(Op, DAG); return LowerToScalableOp(Op, DAG);

▲ Show 20 Lines • Show All 12,584 Lines • ▼ Show 20 Lines auto NewLoad = DAG.getMaskedLoad(

Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),

Load->getExtensionType()); Load->getExtensionType());

auto Result = convertFromScalableVector(DAG, VT, NewLoad); auto Result = convertFromScalableVector(DAG, VT, NewLoad);

SDValue MergedValues[2] = {Result, Load->getChain()}; SDValue MergedValues[2] = {Result, Load->getChain()};

return DAG.getMergeValues(MergedValues, DL); return DAG.getMergeValues(MergedValues, DL);

} }

static SDValue convertFixedMaskToScalableVector(SDValue Mask,

peterwaller-armUnsubmitted

Done

Nit: Extraneous blank

peterwaller-arm: Nit: Extraneous blank

SelectionDAG &DAG) {

sdesmalenUnsubmitted

Not Done

nit: It would be nice if this could be part of convertToScalableVector. That way, for any fixed-width vector we can ask for its scalable counterpart using the same interface.

sdesmalen: nit: It would be nice if this could be part of `convertToScalableVector`. That way, for //any//…

SDLoc DL(Mask);

EVT InVT = Mask.getValueType();

EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);

auto Op2 = DAG.getConstant(0, DL, ContainerVT);

auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);

EVT CmpVT = Pg.getValueType();

return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,

{Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});

DavidTrubyAuthorUnsubmitted

Done

Not sure how this got through, I'll re-format in the next revision or before pushing.

DavidTruby: Not sure how this got through, I'll re-format in the next revision or before pushing.

}

kmclaughlinUnsubmitted

Not Done

nit: can you just use return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO... here instead?

kmclaughlin: nit: can you just use `return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO...` here instead?

// Convert all fixed length vector loads larger than NEON to masked_loads.

SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(

SDValue Op, SelectionDAG &DAG) const {

bsmithUnsubmitted

Not Done

Nit: Neon sized vectors come through here also

bsmith: Nit: Neon sized vectors come through here also

auto Load = cast<MaskedLoadSDNode>(Op);

Lint: Pre-merge checks

clang-tidy: warning: 'auto Load' can be declared as 'auto *Load' [llvm-qualified-auto]
not useful

Lint: Pre-merge checks: clang-tidy: warning: 'auto Load' can be declared as 'auto *Load' [llvm-qualified-auto] [[https…

if (Load->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD)

sdesmalenUnsubmitted

Not Done

Are you planning to handle sign/zero-extending loads in a follow-up patch?

sdesmalen: Are you planning to handle sign/zero-extending loads in a follow-up patch?

DavidTrubyAuthorUnsubmitted

Done

Right now they work but perform the extension manually (as you can see in the tests).
The plan is to submit a follow-up patch implementing custom lowering for them, so that they use the built-in extension in the load instructions.

DavidTruby: Right now they work but perform the extension manually (as you can see in the tests) The plan…

return SDValue();

SDLoc DL(Op);

EVT VT = Op.getValueType();

EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);

SDValue PassThru;

bool IsPassThruZeroOrUndef = false;

if (Load->getPassThru()->isUndef()) {

PassThru = DAG.getUNDEF(ContainerVT);

IsPassThruZeroOrUndef = true;

} else {

if (ContainerVT.isInteger())

sdesmalenUnsubmitted

Not Done

This may not be worth it, but just sharing my thoughts; when reading this code I kind of thought "Why not just call convertToScalableVector?". This would then end up as an INSERT_SUBVECTOR of a DUP, etc. But that could be simplified by a DAG combine which removes the INSERT_SUBVECTOR.

sdesmalen: This may not be worth it, but just sharing my thoughts; when reading this code I kind of…

DavidTrubyAuthorUnsubmitted

Done

Initially this is how I had tried to do it but I was getting selection failures because the types weren't quite right, so I decided to just manually create the correct vectors here

DavidTruby: Initially this is how I had tried to do it but I was getting selection failures because the…

PassThru = DAG.getConstant(0, DL, ContainerVT);

else

PassThru = DAG.getConstantFP(0, DL, ContainerVT);

if (isZerosVector(Load->getPassThru().getNode()))

IsPassThruZeroOrUndef = true;

}

auto NewLoad = DAG.getMaskedLoad(

ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),

Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),

Load->getAddressingMode(), Load->getExtensionType());

if (!IsPassThruZeroOrUndef) {

SDValue OldPassThru =

convertToScalableVector(DAG, ContainerVT, Load->getPassThru());

NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru);

}

auto Result = convertFromScalableVector(DAG, VT, NewLoad);

SDValue MergedValues[2] = {Result, Load->getChain()};

return DAG.getMergeValues(MergedValues, DL);

}

// Convert all fixed length vector stores larger than NEON to masked_stores. // Convert all fixed length vector stores larger than NEON to masked_stores.

SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(

SDValue Op, SelectionDAG &DAG) const { SDValue Op, SelectionDAG &DAG) const {

auto Store = cast<StoreSDNode>(Op); auto Store = cast<StoreSDNode>(Op);

SDLoc DL(Op); SDLoc DL(Op);

EVT VT = Store->getValue().getValueType(); EVT VT = Store->getValue().getValueType();

EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());

return DAG.getMaskedStore( return DAG.getMaskedStore(

Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),

getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(), getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),

Store->getMemOperand(), Store->getAddressingMode(), Store->getMemOperand(), Store->getAddressingMode(),

Store->isTruncatingStore()); Store->isTruncatingStore());

} }

SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(

SDValue Op, SelectionDAG &DAG) const {

auto Store = cast<MaskedStoreSDNode>(Op);

Lint: Pre-merge checks

clang-tidy: warning: 'auto Store' can be declared as 'auto *Store' [llvm-qualified-auto]
not useful

Lint: Pre-merge checks: clang-tidy: warning: 'auto Store' can be declared as 'auto *Store' [llvm-qualified-auto]…

if (Store->isTruncatingStore())

return SDValue();

SDLoc DL(Op);

EVT VT = Store->getValue().getValueType();

EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());

SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);

return DAG.getMaskedStore(

Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),

Mask, Store->getMemoryVT(), Store->getMemOperand(),

Store->getAddressingMode(), Store->isTruncatingStore());

}

SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(

SDValue Op, SelectionDAG &DAG) const { SDValue Op, SelectionDAG &DAG) const {

SDLoc dl(Op); SDLoc dl(Op);

EVT VT = Op.getValueType(); EVT VT = Op.getValueType();

EVT EltVT = VT.getVectorElementType(); EVT EltVT = VT.getVectorElementType();

bool Signed = Op.getOpcode() == ISD::SDIV; bool Signed = Op.getOpcode() == ISD::SDIV;

unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;

▲ Show 20 Lines • Show All 419 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 218 Lines • ▼ Show 20 Lines bool isLegalElementTypeForSVE(Type *Ty) const {

if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||

Ty->isIntegerTy(32) || Ty->isIntegerTy(64)) Ty->isIntegerTy(32) || Ty->isIntegerTy(64))

return true; return true;

return false; return false;

} }

bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) { bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {

if (isa<FixedVectorType>(DataType) || !ST->hasSVE()) if (!ST->hasSVE())

return false; return false;

// For fixed vectors, avoid scalarization if using SVE for them.

peterwaller-armUnsubmitted

Done

return false;

- // For fixed vectors, aqvoid scalarization if using SVE for them.

+ // For fixed vectors, avoid scalarization if using SVE for them.

if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())

peterwaller-arm:

if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())

return false; // Fall back to scalarization of masked operations.

kmclaughlinUnsubmitted

Not Done

nit: is it maybe worth merging this with the if (!ST->hasSVE()) condition above?

kmclaughlin: nit: is it maybe worth merging this with the `if (!ST->hasSVE())` condition above?

DavidTrubyAuthorUnsubmitted

Done

I kept these conditions separate because they're logically different and the if statement gets difficult to grok in my opinion if they're combined

DavidTruby: I kept these conditions separate because they're logically different and the if statement gets…

return isLegalElementTypeForSVE(DataType->getScalarType()); return isLegalElementTypeForSVE(DataType->getScalarType());

} }

bool isLegalMaskedLoad(Type *DataType, Align Alignment) { bool isLegalMaskedLoad(Type *DataType, Align Alignment) {

return isLegalMaskedLoadStore(DataType, Alignment); return isLegalMaskedLoadStore(DataType, Alignment);

} }

bool isLegalMaskedStore(Type *DataType, Align Alignment) { bool isLegalMaskedStore(Type *DataType, Align Alignment) {

▲ Show 20 Lines • Show All 69 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll

This file was added.

				; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
				; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
				; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
				bsmithUnsubmitted Not Done Reply Inline Actions Nit: VBITS_GE_256 is redundant (it's used in all the same places as CHECK), just use CHECK. bsmith: Nit: VBITS_GE_256 is redundant (it's used in all the same places as CHECK), just use CHECK.
				; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512

				target triple = "aarch64-unknown-linux-gnu"

				; Don't use SVE when its registers are no bigger than NEON.
				; NO_SVE-NOT: ptrue

				;
				; Masked Loads
				;
				; Masked load of <2 x half>: the mask is `fcmp oeq` of the two loaded inputs
				; and the passthru is zeroinitializer. The expected code moves the compare
				; result through .2s and .4h lanes (shl/sshr pairs) before cmpne forms the
				; predicate for ld1h. Both lane inserts must target the same [[V1]] register.
				define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
				; CHECK-LABEL: masked_load_v2f16:
				; CHECK: ldr s[[N0:[0-9]+]], [x0]
				; CHECK-NEXT: ldr s[[N1:[0-9]+]], [x1]
				; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].h, vl4
				; CHECK-NEXT: fcmeq v[[N2:[0-9]+]].4h, v[[N0]].4h, v[[N1]].4h
				sdesmalenUnsubmitted Done Reply Inline Actions Please use `CHECK-NEXT` for these, so that we can check there are no other instructions being emitted. sdesmalen: Please use `CHECK-NEXT` for these, so that we can check there are no other instructions being…
				; CHECK-NEXT: umov [[W0:w[0-9]+]], v[[N2]].h[0]
				; CHECK-NEXT: umov [[W1:w[0-9]+]], v[[N2]].h[1]
				; CHECK-NEXT: fmov s[[V0:[0-9]+]], [[W0]]
				; CHECK-NEXT: mov v[[V0]].s[1], [[W1]]
				; CHECK-NEXT: shl v[[V0]].2s, v[[V0]].2s, #16
				sdesmalenUnsubmitted Not Done Reply Inline Actions nit: fmov s[[V0:[0-9]+]], [[W0]] mov v[[V0]].s[1], [[W1]] sdesmalen: nit: fmov s[[V0:[0-9]+]], [[W0]] mov v[[V0]].s[1], [[W1]]
				; CHECK-NEXT: sshr v[[V0]].2s, v[[V0]].2s, #16
				; CHECK-NEXT: movi [[D0:d[0-9]+]], #0000000000000000
				; CHECK-NEXT: fmov [[W1]], s[[V0]]
				; CHECK-NEXT: mov [[W0]], v[[V0]].s[1]
				; CHECK-NEXT: mov [[V1:v[0-9]+]].h[0], [[W1]]
				; CHECK-NEXT: mov [[V1]].h[1], [[W0]]
				; CHECK-NEXT: shl v[[V0]].4h, [[V1]].4h, #15
				sdesmalenUnsubmitted Not Done Reply Inline Actions nit: mov [[V1]].h[1], [[W0]] sdesmalen: nit: mov [[V1]].h[1], [[W0]]
				; CHECK-NEXT: sshr v[[V0]].4h, v[[V0]].4h, #15
				; CHECK-NEXT: cmpne [[PG1:p[0-9]+]].h, [[PG0]]/z, z[[N2]].h, #0
				; CHECK-NEXT: ld1h { z0.h }, [[PG1]]/z, [x0]
				; CHECK-NEXT: ret
				%a = load <2 x half>, <2 x half>* %ap
				%b = load <2 x half>, <2 x half>* %bp
				%mask = fcmp oeq <2 x half> %a, %b
				%load = call <2 x half> @llvm.masked.load.v2f16(<2 x half>* %ap, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
				ret <2 x half> %load
				}

				; Masked load of <2 x float>: the mask is `fcmp oeq` of the two loaded inputs
				; and the passthru is zeroinitializer. The compare is expected on NEON
				; registers (fcmeq) and is turned into an SVE predicate by cmpne before ld1w.
				define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
				; CHECK-LABEL: masked_load_v2f32:
				; CHECK: ldr d[[N0:[0-9]+]], [x0]
				; CHECK-NEXT: ldr d[[N1:[0-9]+]], [x1]
				; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
				; CHECK-NEXT: fcmeq v[[N2:[0-9]+]].2s, v[[N0]].2s, v[[N1]].2s
				; CHECK-NEXT: cmpne [[PG1:p[0-9]+]].s, [[PG0]]/z, z[[N2]].s, #0
				; CHECK-NEXT: ld1w { z0.s }, [[PG1]]/z, [x0]
				; CHECK-NEXT: ret
				%a = load <2 x float>, <2 x float>* %ap
				%b = load <2 x float>, <2 x float>* %bp
				%mask = fcmp oeq <2 x float> %a, %b
				%load = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %ap, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
				ret <2 x float> %load
				}

				; Masked load of <4 x float>: the mask is `fcmp oeq` of the two loaded inputs
				; and the passthru is zeroinitializer. The compare is expected on NEON
				; registers (fcmeq) and is turned into an SVE predicate by cmpne before ld1w.
				define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
				; CHECK-LABEL: masked_load_v4f32:
				; CHECK: ldr q[[N0:[0-9]+]], [x0]
				; CHECK-NEXT: ldr q[[N1:[0-9]+]], [x1]
				; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl4
				; CHECK-NEXT: fcmeq v[[N2:[0-9]+]].4s, v[[N0]].4s, v[[N1]].4s
				; CHECK-NEXT: cmpne [[PG1:p[0-9]+]].s, [[PG0]]/z, z[[N2]].s, #0
				; CHECK-NEXT: ld1w { z0.s }, [[PG1]]/z, [x0]
				; CHECK-NEXT: ret
				%a = load <4 x float>, <4 x float>* %ap
				%b = load <4 x float>, <4 x float>* %bp
				%mask = fcmp oeq <4 x float> %a, %b
				%load = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %ap, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
				ret <4 x float> %load
				}

				; Masked load of <8 x float> with an `fcmp oeq` mask and zeroinitializer
				; passthru. The expected ptrue element count is computed from VBYTES and
				; capped at 8; the loaded vector is stored through x8.
				define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
				; CHECK-LABEL: masked_load_v8f32:
				; CHECK: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
				; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; CHECK-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
				; CHECK-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
				; CHECK-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x0]
				; CHECK-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
				; CHECK-NEXT: ret
				%a = load <8 x float>, <8 x float>* %ap
				%b = load <8 x float>, <8 x float>* %bp
				%mask = fcmp oeq <8 x float> %a, %b
				%load = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
				ret <8 x float> %load
				}

				; Masked load of <16 x float> with an `fcmp oeq` mask and zeroinitializer
				; passthru. The instruction sequence is only checked for runs with at least
				; 512-bit vectors; the ptrue element count is capped at 16.
				define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
				; CHECK-LABEL: masked_load_v16f32:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
				; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
				; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x float>, <16 x float>* %ap
				%b = load <16 x float>, <16 x float>* %bp
				%mask = fcmp oeq <16 x float> %a, %b
				%load = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer)
				ret <16 x float> %load
				}

				; Masked load of <32 x float> with an `fcmp oeq` mask and zeroinitializer
				; passthru. The instruction sequence is only checked for runs with at least
				; 1024-bit vectors; the ptrue element count is capped at 32.
				define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
				; CHECK-LABEL: masked_load_v32f32:
				; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
				; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_1024-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
				; VBITS_GE_1024-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
				; VBITS_GE_1024-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
				; VBITS_GE_1024-NEXT: ret
				%a = load <32 x float>, <32 x float>* %ap
				%b = load <32 x float>, <32 x float>* %bp
				%mask = fcmp oeq <32 x float> %a, %b
				%load = call <32 x float> @llvm.masked.load.v32f32(<32 x float>* %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
				ret <32 x float> %load
				}

				; Masked load of <64 x float> with an `fcmp oeq` mask and zeroinitializer
				; passthru. The instruction sequence is only checked for the run with
				; 2048-bit vectors; the ptrue element count is capped at 64.
				define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
				; CHECK-LABEL: masked_load_v64f32:
				; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
				; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_2048-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
				; VBITS_GE_2048-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
				; VBITS_GE_2048-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
				; VBITS_GE_2048-NEXT: ret

				%a = load <64 x float>, <64 x float>* %ap
				%b = load <64 x float>, <64 x float>* %bp
				%mask = fcmp oeq <64 x float> %a, %b
				%load = call <64 x float> @llvm.masked.load.v64f32(<64 x float>* %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer)
				ret <64 x float> %load
				}

				; Masked load of <64 x i8> with an `icmp eq` mask and undef passthru. The
				; instruction sequence is only checked for runs with at least 512-bit vectors.
				define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
				; CHECK-LABEL: masked_load_v64i8:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].b, vl64
				; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
				; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG1]].b, [[PG0]]/z, [[Z0]].b, #0
				; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, [[PG0]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <64 x i8>, <64 x i8>* %ap
				%b = load <64 x i8>, <64 x i8>* %bp
				%mask = icmp eq <64 x i8> %a, %b
				%load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
				ret <64 x i8> %load
				}

				; Masked load of <32 x i16> with an `icmp eq` mask and undef passthru. The
				; instruction sequence is only checked for runs with at least 512-bit vectors.
				; Every line after the first uses -NEXT so no extra instruction can slip in
				; before the final ret.
				define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
				; CHECK-LABEL: masked_load_v32i16:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].h, vl32
				; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
				; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG1]].h, [[PG0]]/z, [[Z0]].h, #0
				; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG0]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <32 x i16>, <32 x i16>* %ap
				%b = load <32 x i16>, <32 x i16>* %bp
				%mask = icmp eq <32 x i16> %a, %b
				%load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
				ret <32 x i16> %load
				}

				; Masked load of <16 x i32> with an `icmp eq` mask and undef passthru. The
				; instruction sequence is only checked for runs with at least 512-bit vectors.
				define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
				; CHECK-LABEL: masked_load_v16i32:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
				; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x i32>, <16 x i32>* %ap
				%b = load <16 x i32>, <16 x i32>* %bp
				%mask = icmp eq <16 x i32> %a, %b
				%load = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %ap, i32 8, <16 x i1> %mask, <16 x i32> undef)
				ret <16 x i32> %load
				}

				; Masked load of <8 x i64> with an `icmp eq` mask and undef passthru. The
				; instruction sequence is only checked for runs with at least 512-bit vectors.
				define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
				; CHECK-LABEL: masked_load_v8i64:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
				; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG0]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i64>, <8 x i64>* %ap
				%b = load <8 x i64>, <8 x i64>* %bp
				%mask = icmp eq <8 x i64> %a, %b
				%load = call <8 x i64> @llvm.masked.load.v8i64(<8 x i64>* %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
				ret <8 x i64> %load
				}

				; Masked load of <8 x i64> whose passthru is %b (the second loaded input)
				; rather than undef. A sel is expected to combine the loaded value with the
				; %b register under the mask predicate before the store through x8.
				define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
				; CHECK-LABEL: masked_load_passthru_v8i64:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
				; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i64>, <8 x i64>* %ap
				%b = load <8 x i64>, <8 x i64>* %bp
				%mask = icmp eq <8 x i64> %a, %b
				%load = call <8 x i64> @llvm.masked.load.v8i64(<8 x i64>* %ap, i32 8, <8 x i1> %mask, <8 x i64> %b)
				ret <8 x i64> %load
				}

				; Masked load of <8 x double> with an `fcmp oeq` mask whose passthru is %b
				; (the second loaded input). A sel is expected to combine the loaded value
				; with the %b register under the mask predicate before the store through x8.
				define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>* %bp) #0 {
				; CHECK-LABEL: masked_load_passthru_v8f64:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
				; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
				; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x double>, <8 x double>* %ap
				%b = load <8 x double>, <8 x double>* %bp
				%mask = fcmp oeq <8 x double> %a, %b
				%load = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %ap, i32 8, <8 x i1> %mask, <8 x double> %b)
				ret <8 x double> %load
				}

				; Masked load of <32 x i8> (`icmp eq` mask, undef passthru) followed by a
				; sext to <32 x i16>. The extension is expected as a single sunpklo after the
				; ld1b, with a fresh .h predicate for the store.
				define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
				; CHECK-LABEL: masked_load_sext_v32i8i16:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].b, vl32
				; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
				; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
				; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
				; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <32 x i8>, <32 x i8>* %ap
				%b = load <32 x i8>, <32 x i8>* %bp
				%mask = icmp eq <32 x i8> %a, %b
				%load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
				%ext = sext <32 x i8> %load to <32 x i16>
				ret <32 x i16> %ext
				}

				; Masked load of <16 x i8> (`icmp eq` mask, undef passthru) followed by a
				; sext to <16 x i32>, expected as two sunpklo steps after the ld1b. The
				; 128-bit inputs are compared with NEON cmeq; the register number captured
				; from its result ([[V]]) is what cmpne must read as a z register, and the
				; ld1b destination is captured locally as [[Z0]].
				define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
				; CHECK-LABEL: masked_load_sext_v16i8i32:
				; VBITS_GE_512: ldr q0, [x0]
				; VBITS_GE_512-NEXT: ldr q1, [x1]
				; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl16
				; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, z[[V]].b, #0
				; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x i8>, <16 x i8>* %ap
				%b = load <16 x i8>, <16 x i8>* %bp
				%mask = icmp eq <16 x i8> %a, %b
				%load = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
				%ext = sext <16 x i8> %load to <16 x i32>
				ret <16 x i32> %ext
				}

				; Masked load of <8 x i8> (`icmp eq` mask, undef passthru) followed by a
				; sext to <8 x i64>, expected as three sunpklo steps after the ld1b. The
				; 64-bit inputs are compared with NEON cmeq and cmpne converts the result to
				; a predicate. The ld1b destination is captured locally as [[Z0]] so the
				; following lines do not depend on a capture from an earlier function.
				define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
				; CHECK-LABEL: masked_load_sext_v8i8i64:
				; VBITS_GE_512: ldr d0, [x0]
				; VBITS_GE_512-NEXT: ldr d1, [x1]
				; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl8
				; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
				; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].b, p0/z, z[[V]].b, #0
				; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p[[PG]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
				; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i8>, <8 x i8>* %ap
				%b = load <8 x i8>, <8 x i8>* %bp
				%mask = icmp eq <8 x i8> %a, %b
				%load = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
				%ext = sext <8 x i8> %load to <8 x i64>
				ret <8 x i64> %ext
				}

				; Masked load of <16 x i16> (`icmp eq` mask, undef passthru) followed by a
				; sext to <16 x i32>. The extension is expected as a single sunpklo after the
				; ld1h, with a fresh .s predicate for the store.
				define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
				; CHECK-LABEL: masked_load_sext_v16i16i32:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].h, vl16
				; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
				; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
				; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x i16>, <16 x i16>* %ap
				%b = load <16 x i16>, <16 x i16>* %bp
				%mask = icmp eq <16 x i16> %a, %b
				%load = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
				%ext = sext <16 x i16> %load to <16 x i32>
				ret <16 x i32> %ext
				}

				; Masked load of <8 x i16> (`icmp eq` mask, undef passthru) followed by a
				; sext to <8 x i64>, expected as two sunpklo steps after the ld1h. The
				; 128-bit inputs are compared with NEON cmeq and cmpne converts the result to
				; a predicate. The ld1h destination is captured locally as [[Z0]] so the
				; following lines do not depend on a capture from an earlier function.
				define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
				; CHECK-LABEL: masked_load_sext_v8i16i64:
				; VBITS_GE_512: ldr q0, [x0]
				; VBITS_GE_512-NEXT: ldr q1, [x1]
				; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].h, vl8
				; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
				; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].h, p0/z, z[[V]].h, #0
				; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p[[PG]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
				; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i16>, <8 x i16>* %ap
				%b = load <8 x i16>, <8 x i16>* %bp
				%mask = icmp eq <8 x i16> %a, %b
				%load = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
				%ext = sext <8 x i16> %load to <8 x i64>
				ret <8 x i64> %ext
				}

				; Masked load of <8 x i32> (`icmp eq` mask, undef passthru) followed by a
				; sext to <8 x i64>. The extension is expected as a single sunpklo after the
				; ld1w, with a fresh .d predicate for the store.
				define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
				; CHECK-LABEL: masked_load_sext_v8i32i64:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8
				; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
				; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
				; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i32>, <8 x i32>* %ap
				%b = load <8 x i32>, <8 x i32>* %bp
				%mask = icmp eq <8 x i32> %a, %b
				%load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
				%ext = sext <8 x i32> %load to <8 x i64>
				ret <8 x i64> %ext
				}

				; Masked load of <32 x i8> (`icmp eq` mask, undef passthru) followed by a
				; zext to <32 x i16>. The extension is expected as a single uunpklo after the
				; ld1b, with a fresh .h predicate for the store.
				define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
				; CHECK-LABEL: masked_load_zext_v32i8i16:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].b, vl32
				; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
				; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
				; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
				; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <32 x i8>, <32 x i8>* %ap
				%b = load <32 x i8>, <32 x i8>* %bp
				%mask = icmp eq <32 x i8> %a, %b
				%load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
				%ext = zext <32 x i8> %load to <32 x i16>
				ret <32 x i16> %ext
				}

				; Masked load of <16 x i8> (`icmp eq` mask, undef passthru) followed by a
				; zext to <16 x i32>, expected as two uunpklo steps after the ld1b. The
				; 128-bit inputs are compared with NEON cmeq; the register number captured
				; from its result ([[V]]) is what cmpne must read as a z register, and the
				; ld1b destination is captured locally as [[Z0]].
				define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
				; CHECK-LABEL: masked_load_zext_v16i8i32:
				; VBITS_GE_512: ldr q0, [x0]
				; VBITS_GE_512-NEXT: ldr q1, [x1]
				; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl16
				; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, z[[V]].b, #0
				; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x i8>, <16 x i8>* %ap
				%b = load <16 x i8>, <16 x i8>* %bp
				%mask = icmp eq <16 x i8> %a, %b
				%load = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
				%ext = zext <16 x i8> %load to <16 x i32>
				ret <16 x i32> %ext
				}

				; Masked load of <8 x i8> (`icmp eq` mask, undef passthru) followed by a
				; zext to <8 x i64>, expected as three uunpklo steps after the ld1b. The
				; 64-bit inputs are compared with NEON cmeq and cmpne converts the result to
				; a predicate. The ld1b destination is captured locally as [[Z0]] so the
				; following lines do not depend on a capture from an earlier function.
				define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
				; CHECK-LABEL: masked_load_zext_v8i8i64:
				; VBITS_GE_512: ldr d0, [x0]
				; VBITS_GE_512-NEXT: ldr d1, [x1]
				; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl8
				; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
				; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].b, p0/z, z[[V]].b, #0
				; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p[[PG]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
				; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i8>, <8 x i8>* %ap
				%b = load <8 x i8>, <8 x i8>* %bp
				%mask = icmp eq <8 x i8> %a, %b
				%load = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
				%ext = zext <8 x i8> %load to <8 x i64>
				ret <8 x i64> %ext
				}

				; Masked load of <16 x i16> (`icmp eq` mask, undef passthru) followed by a
				; zext to <16 x i32>. The extension is expected as a single uunpklo after the
				; ld1h, with a fresh .s predicate for the store.
				define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
				; CHECK-LABEL: masked_load_zext_v16i16i32:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].h, vl16
				; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
				; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
				; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x i16>, <16 x i16>* %ap
				%b = load <16 x i16>, <16 x i16>* %bp
				%mask = icmp eq <16 x i16> %a, %b
				%load = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
				%ext = zext <16 x i16> %load to <16 x i32>
				ret <16 x i32> %ext
				}

				; Masked load of <8 x i16> (`icmp eq` mask, undef passthru) followed by a
				; zext to <8 x i64>, expected as two uunpklo steps after the ld1h. The
				; 128-bit inputs are compared with NEON cmeq and cmpne converts the result to
				; a predicate. The ld1h destination is captured locally as [[Z0]] so the
				; following lines do not depend on a capture from an earlier function.
				define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
				; CHECK-LABEL: masked_load_zext_v8i16i64:
				; VBITS_GE_512: ldr q0, [x0]
				; VBITS_GE_512-NEXT: ldr q1, [x1]
				; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].h, vl8
				; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
				; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].h, p0/z, z[[V]].h, #0
				; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p[[PG]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
				; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i16>, <8 x i16>* %ap
				%b = load <8 x i16>, <8 x i16>* %bp
				%mask = icmp eq <8 x i16> %a, %b
				%load = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
				%ext = zext <8 x i16> %load to <8 x i64>
				ret <8 x i64> %ext
				}

				; Masked load of <8 x i32> (`icmp eq` mask, undef passthru) followed by a
				; zext to <8 x i64>. The extension is expected as a single uunpklo after the
				; ld1w, with a fresh .d predicate for the store.
				define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
				; CHECK-LABEL: masked_load_zext_v8i32i64:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8
				; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
				; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
				; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i32>, <8 x i32>* %ap
				%b = load <8 x i32>, <8 x i32>* %bp
				%mask = icmp eq <8 x i32> %a, %b
				%load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
				%ext = zext <8 x i32> %load to <8 x i64>
				ret <8 x i64> %ext
				}

				declare <2 x half> @llvm.masked.load.v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>)
				declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
				declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
				declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
				declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
				declare <32 x float> @llvm.masked.load.v32f32(<32 x float>*, i32, <32 x i1>, <32 x float>)
				declare <64 x float> @llvm.masked.load.v64f32(<64 x float>*, i32, <64 x i1>, <64 x float>)

				declare <64 x i8> @llvm.masked.load.v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
				declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
				declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
				declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
				declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
				declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
				declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
				declare <32 x i16> @llvm.masked.load.v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
				declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
				declare <8 x i64> @llvm.masked.load.v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)
				declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)

				attributes #0 = { "target-features"="+sve" }

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll

This file was added.

				; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
				; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
				; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
				; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512

				target triple = "aarch64-unknown-linux-gnu"

				; Don't use SVE when its registers are no bigger than NEON.
				; NO_SVE-NOT: ptrue

				;;
				;; Masked Stores
				;;
				; Masked store of <2 x half>. The mask is `fcmp oeq` of the vectors loaded
				; from %ap and %bp, and %a is stored through %bp under that mask. The
				; expected sequence compares with NEON, rebuilds the two mask lanes through
				; general-purpose registers, and finally converts the result into an SVE
				; predicate for st1h. Both lane inserts into the rebuilt mask must target
				; the same vector register, so the second insert reuses the V1 capture
				; instead of binding it again.
				define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
				; CHECK-LABEL: masked_store_v2f16:
				; CHECK: ldr s0, [x0]
				; CHECK-NEXT: ldr s1, [x1]
				; CHECK-NEXT: movi [[D0:d[0-9]+]], #0000000000000000
				bsmithUnsubmitted Not Done Reply Inline Actions Why is -NEXT missing from all of the fcmeq lines? I see now that the loads are missing from the CHECK lines, they should probably be in for completeness. (Same for loads tests) bsmith: Why is -NEXT missing from all of the fcmeq lines? I see now that the loads are missing from…
				; CHECK-NEXT: ptrue p[[P0:[0-9]+]].h, vl4
				; CHECK-NEXT: fcmeq v[[P1:[0-9]+]].4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
				; CHECK-NEXT: umov [[W0:w[0-9]+]], v[[P1]].h[0]
				; CHECK-NEXT: umov [[W1:w[0-9]+]], v[[P1]].h[1]
				; CHECK-NEXT: fmov s[[V0:[0-9]+]], [[W0]]
				; CHECK-NEXT: mov v[[V0]].s[1], [[W1]]
				; CHECK-NEXT: shl v[[V0]].2s, v[[V0]].2s, #16
				; CHECK-NEXT: sshr v[[V0]].2s, v[[V0]].2s, #16
				; CHECK-NEXT: fmov [[W1]], s[[V0]]
				; CHECK-NEXT: mov [[W0]], v[[V0]].s[1]
				; CHECK-NEXT: mov [[V1:v[0-9]+]].h[0], [[W1]]
				; CHECK-NEXT: mov [[V1]].h[1], [[W0]]
				; CHECK-NEXT: shl v[[V0]].4h, [[V1]].4h, #15
				; CHECK-NEXT: sshr v[[V0]].4h, v[[V0]].4h, #15
				; CHECK-NEXT: cmpne p[[P2:[0-9]+]].h, p[[P0]]/z, z[[P1]].h, #0
				; CHECK-NEXT: st1h { z0.h }, p[[P2]], [x{{[0-9]+}}]
				; CHECK-NEXT: ret
				%a = load <2 x half>, <2 x half>* %ap
				%b = load <2 x half>, <2 x half>* %bp
				%mask = fcmp oeq <2 x half> %a, %b
				call void @llvm.masked.store.v2f16(<2 x half> %a, <2 x half>* %bp, i32 8, <2 x i1> %mask)
				ret void
				}


				; Masked store of <2 x float>. Loads %a and %b, builds the mask with
				; `fcmp oeq`, and stores %a through %bp under that mask. The expected
				; sequence loads and compares the 64-bit operands with NEON (ldr d / fcmeq
				; .2s), then uses an SVE cmpne governed by a `ptrue ... vl2` to turn the
				; NEON mask into the predicate consumed by st1w.
				; NOTE(review): the st1w data register is matched as a literal z0 rather
				; than through a capture - confirm that is intended.
				define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
				; CHECK-LABEL: masked_store_v2f32:
				; CHECK: ldr d0, [x0]
				; CHECK-NEXT: ldr d1, [x1]
				; CHECK-NEXT: ptrue p[[P0:[0-9]+]].s, vl2
				; CHECK-NEXT: fcmeq v[[P1:[0-9]+]].2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
				; CHECK-NEXT: cmpne p[[P2:[0-9]+]].s, p[[P0]]/z, z[[P1]].s, #0
				; CHECK-NEXT: st1w { z0.s }, p[[P2]], [x{{[0-9]+}}]
				; CHECK-NEXT: ret
				%a = load <2 x float>, <2 x float>* %ap
				%b = load <2 x float>, <2 x float>* %bp
				%mask = fcmp oeq <2 x float> %a, %b
				call void @llvm.masked.store.v2f32(<2 x float> %a, <2 x float>* %bp, i32 8, <2 x i1> %mask)
				ret void
				}

				; Masked store of <4 x float>. Loads %a and %b, builds the mask with
				; `fcmp oeq`, and stores %a through %bp under that mask. The expected
				; sequence loads and compares the 128-bit operands with NEON (ldr q /
				; fcmeq .4s), then uses an SVE cmpne governed by a `ptrue ... vl4` to turn
				; the NEON mask into the predicate consumed by st1w.
				define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
				; CHECK-LABEL: masked_store_v4f32:
				; CHECK: ldr q0, [x0]
				; CHECK-NEXT: ldr q1, [x1]
				; CHECK-NEXT: ptrue p[[P0:[0-9]+]].s, vl4
				; CHECK-NEXT: fcmeq v[[P1:[0-9]+]].4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
				; CHECK-NEXT: cmpne p[[P2:[0-9]+]].s, p[[P0]]/z, z[[P1]].s, #0
				; CHECK-NEXT: st1w { z0.s }, p[[P2]], [x{{[0-9]+}}]
				; CHECK-NEXT: ret
				%a = load <4 x float>, <4 x float>* %ap
				%b = load <4 x float>, <4 x float>* %bp
				%mask = fcmp oeq <4 x float> %a, %b
				call void @llvm.masked.store.v4f32(<4 x float> %a, <4 x float>* %bp, i32 8, <4 x i1> %mask)
				ret void
				}

				; Masked store of <8 x float>, the first size expected to be handled
				; entirely with SVE instructions. The governing predicate is a ptrue whose
				; vector length is min(VBYTES / 4, 8) elements; both operands are loaded
				; with ld1w under it, compared with a predicated fcmeq, the compare result
				; is materialised as an all-ones/zero vector (mov ... #-1) and converted
				; back into a predicate with cmpne before the st1w. %a is stored through
				; %bp.
				; NOTE(review): the st1w data register is matched as a literal z0 rather
				; than the captured Z0 - confirm that is intended.
				define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
				; CHECK-LABEL: masked_store_v8f32:
				; CHECK: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
				; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
				; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
				; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; CHECK-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
				; CHECK-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z2]].s, #0
				; CHECK-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
				; CHECK-NEXT: ret
				%a = load <8 x float>, <8 x float>* %ap
				%b = load <8 x float>, <8 x float>* %bp
				%mask = fcmp oeq <8 x float> %a, %b
				call void @llvm.masked.store.v8f32(<8 x float> %a, <8 x float>* %bp, i32 8, <8 x i1> %mask)
				ret void
				}

				; Masked store of <16 x float>; the instruction sequence is only checked
				; for configurations of at least 512 bits. The governing predicate is a
				; ptrue of min(VBYTES / 4, 16) elements; both operands are loaded under it,
				; compared with a predicated fcmeq, the result is materialised as a vector
				; (mov ... #-1) and converted back to a predicate with cmpne before the
				; st1w. %a is stored through %ap.
				; The ptrue binds PG0 itself so the later uses do not depend on a capture
				; left over from an earlier function, and the cmpne reads the register
				; written by the preceding mov (Z2).
				define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
				; CHECK-LABEL: masked_store_v16f32:
				; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
				; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
				; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
				; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_512-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
				; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z2]].s, #0
				; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x float>, <16 x float>* %ap
				%b = load <16 x float>, <16 x float>* %bp
				%mask = fcmp oeq <16 x float> %a, %b
				call void @llvm.masked.store.v16f32(<16 x float> %a, <16 x float>* %ap, i32 8, <16 x i1> %mask)
				ret void
				}

				; Masked store of <32 x float>; the instruction sequence is only checked
				; for configurations of at least 1024 bits. The governing predicate is a
				; ptrue of min(VBYTES / 4, 32) elements; both operands are loaded under it,
				; compared with a predicated fcmeq, the result is materialised as a vector
				; (mov ... #-1, re-binding Z1) and converted back to a predicate with cmpne
				; before the st1w. %a is stored through %ap.
				; The ptrue binds PG0 itself so the later uses do not depend on a capture
				; left over from an earlier function.
				define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
				; CHECK-LABEL: masked_store_v32f32:
				; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
				; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
				; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
				; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_1024-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
				; VBITS_GE_1024-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
				; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
				; VBITS_GE_1024-NEXT: ret
				%a = load <32 x float>, <32 x float>* %ap
				%b = load <32 x float>, <32 x float>* %bp
				%mask = fcmp oeq <32 x float> %a, %b
				call void @llvm.masked.store.v32f32(<32 x float> %a, <32 x float>* %ap, i32 8, <32 x i1> %mask)
				ret void
				}

				; Masked store of <64 x float>; the instruction sequence is only checked
				; for the 2048-bit configuration. The governing predicate is a ptrue of
				; min(VBYTES / 4, 64) elements; both operands are loaded under it, compared
				; with a predicated fcmeq, the result is materialised as a vector
				; (mov ... #-1, re-binding Z1) and converted back to a predicate with cmpne
				; before the st1w. %a is stored through %ap.
				; The ptrue binds PG0 itself so the later uses do not depend on a capture
				; left over from an earlier function.
				define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
				; CHECK-LABEL: masked_store_v64f32:
				; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
				; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
				; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
				; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_2048-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
				; VBITS_GE_2048-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
				; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
				; VBITS_GE_2048-NEXT: ret
				%a = load <64 x float>, <64 x float>* %ap
				%b = load <64 x float>, <64 x float>* %bp
				%mask = fcmp oeq <64 x float> %a, %b
				call void @llvm.masked.store.v64f32(<64 x float> %a, <64 x float>* %ap, i32 8, <64 x i1> %mask)
				ret void
				}

				; Truncating masked store: loads two <8 x i64> vectors, uses their
				; `icmp eq` result as the mask, truncates %a to <8 x i8> and stores it to
				; %dest under that mask. The instruction sequence is only checked for
				; configurations of at least 512 bits. Both the data (Z0) and the mask
				; vector (Z1) are expected to be narrowed d -> s -> h -> b with uzp1; those
				; steps are matched order-independently, followed by the cmpne that forms
				; the byte predicate and the st1b.
				; NOTE(review): the loads are matched against a literal p0 although the
				; ptrue result is captured as P0, and P1 is captured but never referenced
				; afterwards - confirm both are intended.
				define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 {
				; CHECK-LABEL: masked_store_trunc_v8i64i8:
				; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
				; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
				; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
				; VBITS_GE_512-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
				; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].b, p{{[0-9]+}}/z, [[Z1]].b, #0
				; VBITS_GE_512-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
				; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
				; VBITS_GE_512-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
				; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, p[[P2]], [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i64>, <8 x i64>* %ap
				%b = load <8 x i64>, <8 x i64>* %bp
				%mask = icmp eq <8 x i64> %a, %b
				%val = trunc <8 x i64> %a to <8 x i8>
				call void @llvm.masked.store.v8i8(<8 x i8> %val, <8 x i8>* %dest, i32 8, <8 x i1> %mask)
				ret void
				}

				; Truncating masked store: loads two <8 x i64> vectors, uses their
				; `icmp eq` result as the mask, truncates %a to <8 x i16> and stores it to
				; %dest under that mask. The instruction sequence is only checked for
				; configurations of at least 512 bits. After the compare, the mask is
				; materialised as a vector (mov ... #-1); mask (Z1) and data (Z0) are both
				; narrowed d -> s -> h with uzp1 (matched order-independently), then cmpne
				; forms the halfword predicate used by st1h.
				; NOTE(review): the loads are matched against a literal p0 although the
				; ptrue result is captured as P0, and the mov is predicated on P0 while
				; the compare result P1 is never referenced - confirm both are intended.
				define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 {
				; CHECK-LABEL: masked_store_trunc_v8i64i16:
				; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
				; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.h, vl8
				; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-NEXT: mov [[Z1]].d, p[[P0]]/z, #-1
				; VBITS_GE_512-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
				; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
				; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].h, p{{[0-9]+}}/z, [[Z1]].h, #0
				; VBITS_GE_512-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
				; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
				; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, p[[P2]], [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i64>, <8 x i64>* %ap
				%b = load <8 x i64>, <8 x i64>* %bp
				%mask = icmp eq <8 x i64> %a, %b
				%val = trunc <8 x i64> %a to <8 x i16>
				call void @llvm.masked.store.v8i16(<8 x i16> %val, <8 x i16>* %dest, i32 8, <8 x i1> %mask)
				ret void
				}

				; Truncating masked store: loads two <8 x i64> vectors, uses their
				; `icmp eq` result as the mask, truncates %a to <8 x i32> and stores it to
				; %dest under that mask. The instruction sequence is only checked for
				; configurations of at least 512 bits. After the compare, the mask is
				; materialised as a vector (mov ... #-1); mask (Z1) and data (Z0) are each
				; narrowed d -> s with a single uzp1 (matched order-independently), then
				; cmpne forms the word predicate used by st1w.
				; NOTE(review): the loads are matched against a literal p0 although the
				; ptrue result is captured as P0, and the mov is predicated on P0 while
				; the compare result P1 is never referenced - confirm both are intended.
				define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 {
				; CHECK-LABEL: masked_store_trunc_v8i64i32:
				; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
				; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
				; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.s, vl8
				; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
				; VBITS_GE_512-NEXT: mov [[Z1]].d, p[[P0]]/z, #-1
				; VBITS_GE_512-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
				; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].s, p{{[0-9]+}}/z, [[Z1]].s, #0
				; VBITS_GE_512-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, p[[P2]], [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ret
				%a = load <8 x i64>, <8 x i64>* %ap
				%b = load <8 x i64>, <8 x i64>* %bp
				%mask = icmp eq <8 x i64> %a, %b
				%val = trunc <8 x i64> %a to <8 x i32>
				call void @llvm.masked.store.v8i32(<8 x i32> %val, <8 x i32>* %dest, i32 8, <8 x i1> %mask)
				ret void
				}

				; Truncating masked store: loads two <16 x i32> vectors, uses their
				; `icmp eq` result as the mask, truncates %a to <16 x i8> and stores it to
				; %dest under that mask. The instruction sequence is only checked for
				; configurations of at least 512 bits. After the compare, the mask is
				; materialised as a vector (mov ... #-1); mask (Z1) and data (Z0) are both
				; narrowed s -> h -> b with uzp1 (matched order-independently), then cmpne
				; forms the byte predicate used by st1b.
				; NOTE(review): the loads are matched against a literal p0 although the
				; ptrue result is captured as P0, and the mov is predicated on P0 while
				; the compare result P1 is never referenced - confirm both are intended.
				define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 {
				; CHECK-LABEL: masked_store_trunc_v16i32i8:
				; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.b, vl16
				; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_512-NEXT: mov [[Z1]].s, p[[P0]]/z, #-1
				; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
				; VBITS_GE_512-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
				; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].b, p{{[0-9]+}}/z, [[Z1]].b, #0
				; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
				; VBITS_GE_512-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
				; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, p[[P2]], [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x i32>, <16 x i32>* %ap
				%b = load <16 x i32>, <16 x i32>* %bp
				%mask = icmp eq <16 x i32> %a, %b
				%val = trunc <16 x i32> %a to <16 x i8>
				call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %dest, i32 8, <16 x i1> %mask)
				ret void
				}

				; Truncating masked store: loads two <16 x i32> vectors, uses their
				; `icmp eq` result as the mask, truncates %a to <16 x i16> and stores it
				; to %dest under that mask. The instruction sequence is only checked for
				; configurations of at least 512 bits. After the compare, the mask is
				; materialised as a vector (mov ... #-1); mask (Z1) and data (Z0) are each
				; narrowed s -> h with a single uzp1 (matched order-independently), then
				; cmpne forms the halfword predicate used by st1h.
				; NOTE(review): the loads are matched against a literal p0 although the
				; ptrue result is captured as P0, and the mov is predicated on P0 while
				; the compare result P1 is never referenced - confirm both are intended.
				define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 {
				; CHECK-LABEL: masked_store_trunc_v16i32i16:
				; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
				; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.h, vl16
				; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
				; VBITS_GE_512-NEXT: mov [[Z1]].s, p[[P0]]/z, #-1
				; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
				; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].h, p{{[0-9]+}}/z, [[Z1]].h, #0
				; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
				; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, p[[P2]], [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ret
				%a = load <16 x i32>, <16 x i32>* %ap
				%b = load <16 x i32>, <16 x i32>* %bp
				%mask = icmp eq <16 x i32> %a, %b
				%val = trunc <16 x i32> %a to <16 x i16>
				call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %dest, i32 8, <16 x i1> %mask)
				ret void
				}

				; Truncating masked store: loads two <32 x i16> vectors, uses their
				; `icmp eq` result as the mask, truncates %a to <32 x i8> and stores it to
				; %dest under that mask. The instruction sequence is only checked for
				; configurations of at least 512 bits. After the compare, the mask is
				; materialised as a vector (mov ... #-1); mask (Z1) and data (Z0) are each
				; narrowed h -> b with a single uzp1 (matched order-independently), then
				; cmpne forms the byte predicate used by st1b.
				; NOTE(review): the loads are matched against a literal p0 although the
				; ptrue result is captured as P0, and the mov is predicated on P0 while
				; the compare result P1 is never referenced - confirm both are intended.
				define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 {
				; CHECK-LABEL: masked_store_trunc_v32i16i8:
				; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
				; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
				; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
				; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.b, vl32
				; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h
				; VBITS_GE_512-NEXT: mov [[Z1]].h, p[[P0]]/z, #-1
				; VBITS_GE_512-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
				; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].b, p{{[0-9]+}}/z, [[Z1]].b, #0
				; VBITS_GE_512-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
				; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, p[[P2]], [x{{[0-9]+}}]
				; VBITS_GE_512-NEXT: ret
				%a = load <32 x i16>, <32 x i16>* %ap
				%b = load <32 x i16>, <32 x i16>* %bp
				%mask = icmp eq <32 x i16> %a, %b
				%val = trunc <32 x i16> %a to <32 x i8>
				call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %dest, i32 8, <32 x i1> %mask)
				ret void
				}

				; Declarations of the masked-store intrinsics called above. Operands are,
				; in order: the value to store, the destination pointer, the alignment
				; (i32) and the <N x i1> mask.

				; Floating-point variants used by the plain masked-store tests.
				declare void @llvm.masked.store.v2f16(<2 x half>, <2 x half>*, i32, <2 x i1>)
				declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
				declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
				declare void @llvm.masked.store.v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
				declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
				declare void @llvm.masked.store.v32f32(<32 x float>, <32 x float>*, i32, <32 x i1>)
				declare void @llvm.masked.store.v64f32(<64 x float>, <64 x float>*, i32, <64 x i1>)

				; Integer variants used by the truncating masked-store tests.
				declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
				declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
				declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
				declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
				declare void @llvm.masked.store.v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
				declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)

				; Attribute group #0, applied to every test function: enables SVE.
				attributes #0 = { "target-features"="+sve" }

This is an archive of the discontinued LLVM Phabricator instance.

[llvm][sve] Lowering for VLS MLOAD/MSTORE
ClosedPublic

Details

Diff Detail

Unit Tests: Failed

Event Timeline

Revision Contents

Diff 346689

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll

This is an archive of the discontinued LLVM Phabricator instance.

[llvm][sve] Lowering for VLS MLOAD/MSTORE (Closed, Public)

Details

Diff Detail

Unit Tests: Failed

Event Timeline

Revision Contents

Diff 346689

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll

[llvm][sve] Lowering for VLS MLOAD/MSTORE
ClosedPublic