This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
-
AArch64ISelLowering.h
-
AArch64ISelLowering.cpp
1/2
AArch64InstrInfo.td
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
2/3
nontemporal.ll

Differential D72919

[AArch64] Add custom store lowering for 256 bit non-temporal stores.
ClosedPublic

Authored by fhahn on Jan 17 2020, 6:21 AM.

Download Raw Diff

Details

Reviewers

dmgreen
samparker
t.p.northover
ab

Commits

rG3a4691fa810b: [AArch64] Add custom store lowering for 256 bit non-temporal stores.
rG535ed62c5fcb: [AArch64] Add custom store lowering for 256 bit non-temporal stores.

Summary

Currently we fail to lower non-temporal stores for 256+ bit vectors
to STNPQ, because type legalization will split them up into 128 bit stores.
Because there are no single (un-paired) non-temporal stores, creating STNPQ
in the Load/Store optimizer would be quite tricky.

This patch adds custom lowering for 256 bit non-temporal vector stores
to improve the generated code.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

fhahn created this revision. · Jan 17 2020, 6:21 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 17 2020, 6:21 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

Harbormaster failed remote builds in B44265: Diff 238761! · Jan 17 2020, 6:40 AM

Unit tests: pass. 61937 tests passed, 0 failed and 783 were skipped.

clang-tidy: unknown.

clang-format: fail. Please format your changes with clang-format by running git-clang-format HEAD^ or applying this patch.

Build artifacts: diff.json, clang-format.patch, CMakeCache.txt, console-log.txt, test-results.xml

dmgreen added inline comments. · Jan 18 2020, 2:28 AM

llvm/lib/Target/AArch64/AArch64InstrInfo.td
2739	Should this be simm7s16? Same for am_indexed7s128
llvm/test/CodeGen/AArch64/nontemporal.ll
361	Should halves work too?
372	Nit: There's an extra " " here it seems

Thanks for taking a look @dmgreen. I've fixed the offset arguments and added the missing v16f16 handler.

fhahn added inline comments. · Jan 19 2020, 8:25 PM

llvm/lib/Target/AArch64/AArch64InstrInfo.td
2739	Yes! I've updated that and now the offsets should be correct.
llvm/test/CodeGen/AArch64/nontemporal.ll
361	Yes, I've added the missing handler for v16f16.

Unit tests: pass. 62002 tests passed, 0 failed and 783 were skipped.

clang-tidy: unknown.

clang-format: fail. Please format your changes with clang-format by running git-clang-format HEAD^ or applying this patch.

Build artifacts: diff.json, clang-format.patch, CMakeCache.txt, console-log.txt, test-results.xml

Harbormaster failed remote builds in B44375: Diff 239022! · Jan 19 2020, 8:36 PM

LGTM. Thanks

This revision is now accepted and ready to land. · Jan 20 2020, 10:51 AM

Closed by commit rG535ed62c5fcb: [AArch64] Add custom store lowering for 256 bit non-temporal stores. (authored by fhahn). · Explain Why · Jan 21 2020, 3:00 PM

This revision was automatically updated to reflect the committed changes.

Thanks Dave!

fhahn mentioned this in D73158: [AArch64TTI] AArch64 supports NT vector stores through STNP. · Jan 21 2020, 7:40 PM

fhahn mentioned this in rG39ae86ab72d7: [AArch64TTI] AArch64 supports NT vector stores through STNP. · Jan 22 2020, 5:14 PM

fhahn mentioned this in rGd00c050627f4: [AArch64TTI] AArch64 supports NT vector stores through STNP. · Jul 14 2020, 4:15 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.h

3 lines

AArch64ISelLowering.cpp

36 lines

AArch64InstrInfo.td

6 lines

test/

CodeGen/

AArch64/

nontemporal.ll

148 lines

Diff 239439

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 266 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {
ST4LANEpost,		ST4LANEpost,

STG,		STG,
STZG,		STZG,
ST2G,		ST2G,
STZ2G,		STZ2G,

LDP,		LDP,
STP		STP,
		STNP
};		};

} // end namespace AArch64ISD		} // end namespace AArch64ISD

namespace {		namespace {

// Any instruction that defines a 32-bit result zeros out the high half of the		// Any instruction that defines a 32-bit result zeros out the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may		// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
▲ Show 20 Lines • Show All 541 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 519 Lines • ▼ Show 20 Lines	AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);		setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);		setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);		setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

// 128-bit loads and stores can be done without expanding		// 128-bit loads and stores can be done without expanding
setOperationAction(ISD::LOAD, MVT::i128, Custom);		setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);		setOperationAction(ISD::STORE, MVT::i128, Custom);

		// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
		// custom lowering, as there are no un-paired non-temporal stores and
		// legalization will break up 256 bit inputs.
		setOperationAction(ISD::STORE, MVT::v32i8, Custom);
		setOperationAction(ISD::STORE, MVT::v16i16, Custom);
		setOperationAction(ISD::STORE, MVT::v16f16, Custom);
		setOperationAction(ISD::STORE, MVT::v8i32, Custom);
		setOperationAction(ISD::STORE, MVT::v8f32, Custom);
		setOperationAction(ISD::STORE, MVT::v4f64, Custom);
		setOperationAction(ISD::STORE, MVT::v4i64, Custom);

// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.		// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.		// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())		if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);		setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&		if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {		getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
// Issue __sincos_stret if available.		// Issue __sincos_stret if available.
▲ Show 20 Lines • Show All 841 Lines • ▼ Show 20 Lines	const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";		case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";		case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";		case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";
case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";		case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";		case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";		case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
case AArch64ISD::LDP: return "AArch64ISD::LDP";		case AArch64ISD::LDP: return "AArch64ISD::LDP";
case AArch64ISD::STP: return "AArch64ISD::STP";		case AArch64ISD::STP: return "AArch64ISD::STP";
		case AArch64ISD::STNP: return "AArch64ISD::STNP";
}		}
return nullptr;		return nullptr;
}		}

MachineBasicBlock *		MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,		AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {		MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a		// We materialise the F128CSEL pseudo-instruction as some control flow and a
▲ Show 20 Lines • Show All 1,672 Lines • ▼ Show 20 Lines	if (Align < MemVT.getStoreSize() &&
StoreNode->getMemOperand()->getFlags(),		StoreNode->getMemOperand()->getFlags(),
nullptr)) {		nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);		return scalarizeVectorStore(StoreNode, DAG);
}		}

if (StoreNode->isTruncatingStore()) {		if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);		return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}		}
		// 256 bit non-temporal stores can be lowered to STNP. Do this as part of
		// the custom lowering, as there are no un-paired non-temporal stores and
		// legalization will break up 256 bit inputs.
		if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
		MemVT.getVectorElementCount().Min % 2u == 0 &&
		((MemVT.getScalarSizeInBits() == 8u \|\|
		MemVT.getScalarSizeInBits() == 16u \|\|
		MemVT.getScalarSizeInBits() == 32u \|\|
		MemVT.getScalarSizeInBits() == 64u))) {
		SDValue Lo =
		DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
		MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
		StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
		SDValue Hi = DAG.getNode(
		ISD::EXTRACT_SUBVECTOR, Dl,
		MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
		StoreNode->getValue(),
		DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
		SDValue Result = DAG.getMemIntrinsicNode(
		AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
		{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
		StoreNode->getMemoryVT(), StoreNode->getMemOperand());
		return Result;
		}
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {		} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);		assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =		SDValue Lo =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),		DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(0, Dl, MVT::i64));		DAG.getConstant(0, Dl, MVT::i64));
SDValue Hi =		SDValue Hi =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),		DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(1, Dl, MVT::i64));		DAG.getConstant(1, Dl, MVT::i64));
▲ Show 20 Lines • Show All 10,280 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 239 Lines • ▼ Show 20 Lines

	def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;			def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;

	def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,			def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
	SDTCisPtrTy<1>]>;			SDTCisPtrTy<1>]>;

	def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;			def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
	def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;			def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
				def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;

	// Generates the general dynamic sequences, i.e.			// Generates the general dynamic sequences, i.e.
	// adrp x0, :tlsdesc:var			// adrp x0, :tlsdesc:var
	// ldr x1, [x0, #:tlsdesc_lo12:var]			// ldr x1, [x0, #:tlsdesc_lo12:var]
	// add x0, x0, #:tlsdesc_lo12:var			// add x0, x0, #:tlsdesc_lo12:var
	// .tlsdesccall var			// .tlsdesccall var
	// blr x1			// blr x1

	▲ Show 20 Lines • Show All 283 Lines • ▼ Show 20 Lines
	]>;			]>;
	def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>;			def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>;
	def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;			def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
	def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;			def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
	def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;			def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;

	def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;			def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
	def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;			def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
				def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

	def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;			def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	// AArch64 Instruction Predicate Definitions.			// AArch64 Instruction Predicate Definitions.
	▲ Show 20 Lines • Show All 2,174 Lines • ▼ Show 20 Lines
	defm STNPX : StorePairNoAlloc<0b10, 0, GPR64z, simm7s8, "stnp">;			defm STNPX : StorePairNoAlloc<0b10, 0, GPR64z, simm7s8, "stnp">;
	defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;			defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
	defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;			defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
	defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;			defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;

	def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),			def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
	(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;			(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;

				def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
				[Inline comment, not done] dmgreen: Should this be simm7s16? Same for am_indexed7s128
				[Inline reply, done] fhahn (author): Yes! I've updated that and now the offsets should be correct.
				(STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;


	//---			//---
	// (Register offset)			// (Register offset)

	// Integer			// Integer
	defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;			defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
	defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;			defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
	defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;			defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;
	defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;			defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;
	▲ Show 20 Lines • Show All 4,587 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/nontemporal.ll

	; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra \| FileCheck %s			; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra \| FileCheck %s

	define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {			define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
	; CHECK-LABEL: test_stnp_v4i64:			; CHECK-LABEL: test_stnp_v4i64:
	; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1]			; CHECK-NEXT: stnp q0, q1, [x0]
	; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1]
	; CHECK-NEXT: stnp d1, d[[HI1]], [x0, #16]
	; CHECK-NEXT: stnp d0, d[[HI0]], [x0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0			store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
	ret void			ret void
	}			}

	define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {			define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {
	; CHECK-LABEL: test_stnp_v4i32:			; CHECK-LABEL: test_stnp_v4i32:
	; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]			; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
	▲ Show 20 Lines • Show All 312 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: bl _dummy			; CHECK-NEXT: bl _dummy
	%tmp0 = alloca <4 x float>, i32 2			%tmp0 = alloca <4 x float>, i32 2
	%tmp1 = getelementptr <4 x float>, <4 x float>* %tmp0, i32 1			%tmp1 = getelementptr <4 x float>, <4 x float>* %tmp0, i32 1
	store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0			store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
	call void @dummy(<4 x float>* %tmp0)			call void @dummy(<4 x float>* %tmp0)
	ret void			ret void
	}			}

				define void @test_stnp_v32i8(<32 x i8> %v, <32 x i8>* %ptr) {
				; CHECK-LABEL: _test_stnp_v32i8:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: stnp q0, q1, [x0]
				; CHECK-NEXT: ret

				entry:
				store <32 x i8> %v, <32 x i8>* %ptr, align 4, !nontemporal !0
				ret void
				}

				define void @test_stnp_v32i16(<32 x i16> %v, <32 x i16>* %ptr) {
				; CHECK-LABEL: _test_stnp_v32i16:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: stnp q2, q3, [x0, #32]
				; CHECK-NEXT: stnp q0, q1, [x0]
				; CHECK-NEXT: ret

				entry:
				store <32 x i16> %v, <32 x i16>* %ptr, align 4, !nontemporal !0
				ret void
				}

				define void @test_stnp_v32f16(<32 x half> %v, <32 x half>* %ptr) {
				; CHECK-LABEL: _test_stnp_v32f16:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: stnp q2, q3, [x0, #32]
				; CHECK-NEXT: stnp q0, q1, [x0]
				[Inline comment, not done] dmgreen: Should halves work too?
				[Inline reply, done] fhahn (author): Yes, I've added the missing handler for v16f16.
				; CHECK-NEXT: ret

				entry:
				store <32 x half> %v, <32 x half>* %ptr, align 4, !nontemporal !0
				ret void
				}

				define void @test_stnp_v16i32(<16 x i32> %v, <16 x i32>* %ptr) {
				; CHECK-LABEL: _test_stnp_v16i32:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: stnp q2, q3, [x0, #32]
				[Inline comment, done] dmgreen: Nit: There's an extra " " here it seems
				; CHECK-NEXT: stnp q0, q1, [x0]
				; CHECK-NEXT: ret

				entry:
				store <16 x i32> %v, <16 x i32>* %ptr, align 4, !nontemporal !0
				ret void
				}

				define void @test_stnp_v16f32(<16 x float> %v, <16 x float>* %ptr) {
				; CHECK-LABEL: _test_stnp_v16f32:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: stnp q2, q3, [x0, #32]
				; CHECK-NEXT: stnp q0, q1, [x0]
				; CHECK-NEXT: ret

				entry:
				store <16 x float> %v, <16 x float>* %ptr, align 4, !nontemporal !0
				ret void
				}

				define void @test_stnp_v17f32(<17 x float> %v, <17 x float>* %ptr) {
				; CHECK-LABEL: _test_stnp_v17f32:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: ldr s16, [sp, #16]
				; CHECK-NEXT: mov.s v0[1], v1[0]
				; CHECK-NEXT: mov.s v4[1], v5[0]
				; CHECK-NEXT: ldr s1, [sp]
				; CHECK-NEXT: add x8, sp, #20
				; CHECK-NEXT: ld1.s { v16 }[1], [x8]
				; CHECK-NEXT: add x8, sp, #4
				; CHECK-NEXT: ld1.s { v1 }[1], [x8]
				; CHECK-NEXT: add x8, sp, #24
				; CHECK-NEXT: ld1.s { v16 }[2], [x8]
				; CHECK-NEXT: add x8, sp, #8
				; CHECK-NEXT: ld1.s { v1 }[2], [x8]
				; CHECK-NEXT: add x8, sp, #28
				; CHECK-NEXT: ld1.s { v16 }[3], [x8]
				; CHECK-NEXT: add x8, sp, #12
				; CHECK-NEXT: mov.s v0[2], v2[0]
				; CHECK-NEXT: ldr s2, [sp, #32]
				; CHECK-NEXT: mov.s v4[2], v6[0]
				; CHECK-NEXT: mov.s v0[3], v3[0]
				; CHECK-NEXT: mov.s v4[3], v7[0]
				; CHECK-NEXT: mov d3, v4[1]
				; CHECK-NEXT: mov d5, v0[1]
				; CHECK-NEXT: ld1.s { v1 }[3], [x8]
				; CHECK-NEXT: stnp d4, d3, [x0, #16]
				; CHECK-NEXT: stnp d0, d5, [x0]
				; CHECK-NEXT: mov d0, v16[1]
				; CHECK-NEXT: mov d3, v1[1]
				; CHECK-NEXT: stnp d16, d0, [x0, #48]
				; CHECK-NEXT: stnp d1, d3, [x0, #32]
				; CHECK-NEXT: str s2, [x0, #64]
				; CHECK-NEXT: ret

				entry:
				store <17 x float> %v, <17 x float>* %ptr, align 4, !nontemporal !0
				ret void
				}
				define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, <16 x i32>* %ptr) {
				; CHECK-LABEL: _test_stnp_v16i32_invalid_offset:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: mov w8, #32000
				; CHECK-NEXT: mov w9, #32032
				; CHECK-NEXT: add x8, x0, x8
				; CHECK-NEXT: add x9, x0, x9
				; CHECK-NEXT: stnp q2, q3, [x9]
				; CHECK-NEXT: stnp q0, q1, [x8]
				; CHECK-NEXT: ret

				entry:
				%gep = getelementptr <16 x i32>, <16 x i32>* %ptr, i32 500
				store <16 x i32> %v, <16 x i32>* %gep, align 4, !nontemporal !0
				ret void
				}

				define void @test_stnp_v16f64(<16 x double> %v, <16 x double>* %ptr) {
				; CHECK-LABEL: _test_stnp_v16f64:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: stnp q6, q7, [x0, #96]
				; CHECK-NEXT: stnp q4, q5, [x0, #64]
				; CHECK-NEXT: stnp q2, q3, [x0, #32]
				; CHECK-NEXT: stnp q0, q1, [x0]
				; CHECK-NEXT: ret

				entry:
				store <16 x double> %v, <16 x double>* %ptr, align 4, !nontemporal !0
				ret void
				}

				define void @test_stnp_v16i64(<16 x i64> %v, <16 x i64>* %ptr) {
				; CHECK-LABEL: _test_stnp_v16i64:
				; CHECK-NEXT: .cfi_startproc
				; CHECK-NEXT: stnp q6, q7, [x0, #96]
				; CHECK-NEXT: stnp q4, q5, [x0, #64]
				; CHECK-NEXT: stnp q2, q3, [x0, #32]
				; CHECK-NEXT: stnp q0, q1, [x0]
				; CHECK-NEXT: ret

				entry:
				store <16 x i64> %v, <16 x i64>* %ptr, align 4, !nontemporal !0
				ret void
				}

	!0 = !{ i32 1 }			!0 = !{ i32 1 }

	attributes #0 = { nounwind }			attributes #0 = { nounwind }