This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Custom lower some extloads
AbandonedPublic

Authored by mssimpso on Jan 27 2016, 1:14 PM.

Download Raw Diff

Details

Reviewers

gberry
jmolloy

Summary

This patch adds custom lowering for sign- and any-extended loads of v2i8, v4i8 and v2i16. Instead of generating multiple loads followed by vector inserts, we now generate a single scalar load followed by a vector shuffle. This work was adapted from r213897 (the corresponding patch for the X86 target).

Diff Detail

Event Timeline

mssimpso updated this revision to Diff 46172.Jan 27 2016, 1:14 PM

mssimpso retitled this revision from to [AArch64] Custom lower some extloads.

mssimpso updated this object.

mssimpso added reviewers: mcrosier, gberry, jmolloy, chandlerc.

mssimpso added a subscriber: llvm-commits.

Herald added subscribers: mcrosier, rengolin, aemerson. · View Herald TranscriptJan 27 2016, 1:14 PM

bmakam added a subscriber: bmakam.Jan 27 2016, 1:54 PM

junbuml added a subscriber: junbuml.Jan 28 2016, 7:09 AM

Hi Chad,

A couple of comments.

Thanks,

James

lib/Target/AArch64/AArch64ISelLowering.cpp
2334	Is this guaranteed to iterate in size order? Perhaps we should add a comment indicating that we rely upon this? (and an assert that the size of every item is greater than that of the preceding item?)
2343	I'm worried as to how this assert can fire. The logic for getting here looks the same for extload and sextload, so how is this OK for extloads and not for sextloads (and where do sextloads that would fire this assert get filtered out?)
2390	Why can't we zextload here (as well as sext and extloading)? Is there an instruction missing in the ISA for zexting?

This revision now requires changes to proceed.Feb 15 2016, 6:55 AM

Just getting this off my radar and I don't think Matt plans on continuing this work. If that's not the case, feel free to add me back as a reviewer.

Thanks, Chad. I have no plans as of now for this patch. The small performance improvement I thought I saw with this I think was due to alignment instead. We can always resurrect it if needed. And thanks James for the initial review!

chandlerc removed a reviewer: chandlerc.May 12 2016, 1:59 PM

Revision Contents

Path

Size

lib/

Target/

AArch64/

AArch64ISelLowering.h

1 line

AArch64ISelLowering.cpp

122 lines

test/

CodeGen/

AArch64/

neon-truncStore-extLoad.ll

37 lines

Diff 46172

lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 491 Lines • ▼ Show 20 Lines	private:
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
		SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;

SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,		SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;		std::vector<SDNode *> *Created) const override;
unsigned combineRepeatedFPDivisors() const override;		unsigned combineRepeatedFPDivisors() const override;

ConstraintType getConstraintType(StringRef Constraint) const override;		ConstraintType getConstraintType(StringRef Constraint) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,		unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;		SelectionDAG &DAG) const override;
▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines

lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 615 Lines • ▼ Show 20 Lines	if (Subtarget->hasNEON()) {
for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {		for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);		setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);		setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);		setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);		setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);		setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);		setOperationAction(ISD::FROUND, Ty, Legal);
}		}

		// We support custom legalization of extended loads that we can load as
		// scalars and then extend in-register. This prevents us from generating
		// multiple loads and insertions.
		for (MVT VT : MVT::integer_vector_valuetypes()) {
		setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
		setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
		setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
		setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
		setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
		setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
		}
}		}

// Prefer likely predicted branches to selects on out-of-order cores.		// Prefer likely predicted branches to selects on out-of-order cores.
if (Subtarget->isCortexA57())		if (Subtarget->isCortexA57())
PredictableSelectIsExpensive = true;		PredictableSelectIsExpensive = true;
}		}

void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {		void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
▲ Show 20 Lines • Show All 1,651 Lines • ▼ Show 20 Lines	case Intrinsic::aarch64_neon_smin:
return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),		return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umin:		case Intrinsic::aarch64_neon_umin:
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),		return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
}		}
}		}

		SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {

		MVT RegVT = Op.getSimpleValueType();
		assert(RegVT.isVector() && "We only custom lower vector sext loads");
		assert(RegVT.isInteger() && "We only custom lower integer vector sext loads");

		LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
		SDLoc dl(Ld);
		EVT MemVT = Ld->getMemoryVT();
		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
		unsigned RegSz = RegVT.getSizeInBits();

		// The extension type should be any-extend or sign-extend.
		ISD::LoadExtType Ext = Ld->getExtensionType();
		assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) &&
		"Only anyext and sext are currently implemented");
		assert(MemVT != RegVT && "Cannot extend to the same type");
		assert(MemVT.isVector() && "Must load a vector from memory");

		// The number of vector elements and their total size.
		unsigned NumElems = RegVT.getVectorNumElements();
		unsigned MemSz = MemVT.getSizeInBits();
		assert(RegSz > MemSz && "Register size must be greater than the mem size");

		// All sizes must be a power of two.
		assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
		"Non-power-of-two elements are not custom lowered");

		// We attempt to load the original value using scalar loads. First, find the
		// largest scalar type that divides the total loaded size.
		MVT SclrLoadTy = MVT::i8;
		for (MVT VT : MVT::integer_vector_valuetypes())
		jmolloyUnsubmitted Not Done Reply Inline Actions Is this guaranteed to iterate in size order? Perhaps we should add a comment indicating that we rely upon this? (and an assert that the size of every item is greater than that of the preceding item?) jmolloy: Is this guaranteed to iterate in size order? Perhaps we should add a comment indicating that we…
		if (TLI.isTypeLegal(VT))
		if (MemSz % VT.getScalarType().getSizeInBits() == 0)
		SclrLoadTy = VT.getScalarType();

		// Calculate the number of scalar loads that we need to perform in order to
		// load our vector from memory.
		unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

		assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
		jmolloyUnsubmitted Not Done Reply Inline Actions I'm worried as to how this assert can fire. The logic for getting here looks the same for extload and sextload, so how is this OK for extloads and not for sextloads (and where do sextloads that would fire this assert get filtered out?) jmolloy: I'm worried as to how this assert can fire. The logic for getting here looks the same for…
		"Can only lower sext loads with a single scalar load!");

		// We represent our vector as a sequence of elements that are the largest
		// scalars that we can load.
		EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
		RegSz / SclrLoadTy.getSizeInBits());

		// We represent the data using the same element type that is stored in
		// memory.
		EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
		RegSz / MemVT.getScalarSizeInBits());

		assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
		"Invalid vector type");

		// We will perform the extensions using vector shuffles, so we need to ensure
		// the type is not illegal.
		assert(TLI.isTypeLegal(WideVecVT) &&
		"We only lower types that form legal widened vector types");

		SmallVector<SDValue, 8> Chains;
		SDValue Ptr = Ld->getBasePtr();
		SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
		TLI.getPointerTy(DAG.getDataLayout()));
		SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

		// Perform the scalar single loads.
		for (unsigned i = 0; i < NumLoads; ++i) {
		SDValue ScalarLoad =
		DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
		Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
		Ld->getAlignment());
		Chains.push_back(ScalarLoad.getValue(1));
		Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
		ScalarLoad, DAG.getIntPtrConstant(i, dl));
		Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
		}

		SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

		// Bitcast the loaded value to a vector of the original element type, in the
		// size of the target vector type.
		SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
		unsigned SizeRatio = RegSz / MemSz;

		// Sign extend the vector. This will be legalized to a shuffle and shifts.
		if (Ext == ISD::SEXTLOAD) {
		jmolloyUnsubmitted Not Done Reply Inline Actions Why can't we zextload here (as well as sext and extloading)? Is there an instruction missing in the ISA for zexting? jmolloy: Why can't we zextload here (as well as sext and extloading)? Is there an instruction missing in…
		SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
		DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
		return Shuff;
		}

		// If we are not sign extending the vector, just shuffle the loaded elements
		// into the different locations.
		SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
		for (unsigned i = 0; i != NumElems; ++i)
		ShuffleVec[i * SizeRatio] = i;

		SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
		DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);

		// Finally, bitcast to the result requested type.
		Shuff = DAG.getBitcast(RegVT, Shuff);
		DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
		return Shuff;
		}

SDValue AArch64TargetLowering::LowerOperation(SDValue Op,		SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
switch (Op.getOpcode()) {		switch (Op.getOpcode()) {
default:		default:
llvm_unreachable("unimplemented operand");		llvm_unreachable("unimplemented operand");
return SDValue();		return SDValue();
case ISD::BITCAST:		case ISD::BITCAST:
return LowerBITCAST(Op, DAG);		return LowerBITCAST(Op, DAG);
▲ Show 20 Lines • Show All 87 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::FP_TO_UINT:		case ISD::FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);		return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:		case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);		return LowerFSINCOS(Op, DAG);
case ISD::MUL:		case ISD::MUL:
return LowerMUL(Op, DAG);		return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:		case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);		return LowerINTRINSIC_WO_CHAIN(Op, DAG);
		case ISD::LOAD:
		return LowerLOAD(Op, DAG);
}		}
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Calling Convention Implementation		// Calling Convention Implementation
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AArch64GenCallingConv.inc"		#include "AArch64GenCallingConv.inc"
▲ Show 20 Lines • Show All 7,823 Lines • Show Last 20 Lines

test/CodeGen/AArch64/neon-truncStore-extLoad.ll

	Show All 23 Lines
	; CHECK-LABEL: truncStore.v8i16:			; CHECK-LABEL: truncStore.v8i16:
	; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h			; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
	; CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [x{{[0-9]+|sp}}]			; CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
	%b = trunc <8 x i16> %a to <8 x i8>			%b = trunc <8 x i16> %a to <8 x i8>
	store <8 x i8> %b, <8 x i8>* %result			store <8 x i8> %b, <8 x i8>* %result
	ret void			ret void
	}			}

	; A vector LoadExt can not be selected.			define <2 x i8> @loadExt.v2i8(<2 x i8>* %ref) {
	; Test a vector load IR and a sext/zext IR can be selected correctly.			; CHECK-LABEL: loadExt.v2i8:
	define <4 x i32> @loadSExt.v4i8(<4 x i8>* %ref) {			; CHECK: ld1 { [[REG:v[0-9]+]].h }[0], [x0]
	; CHECK-LABEL: loadSExt.v4i8:			; CHECK: ins [[REG]].b[4], [[REG]].b[1]
	; CHECK: ldrsb			%a = load <2 x i8>, <2 x i8>* %ref
	%a = load <4 x i8>, <4 x i8>* %ref			ret <2 x i8> %a
	%conv = sext <4 x i8> %a to <4 x i32>
	ret <4 x i32> %conv
	}			}

	define <4 x i32> @loadZExt.v4i8(<4 x i8>* %ref) {			define <4 x i8> @loadExt.v4i8(<4 x i8>* %ref) {
	; CHECK-LABEL: loadZExt.v4i8:			; CHECK-LABEL: loadExt.v4i8:
	; CHECK: ldrb			; CHECK: ld1 { [[REG:v[0-9]+]].s }[0], [x0]
				; CHECK: zip1 {{v[0-9]+}}.8b, [[REG]].8b, {{v[0-9]+}}.8b
	%a = load <4 x i8>, <4 x i8>* %ref			%a = load <4 x i8>, <4 x i8>* %ref
	%conv = zext <4 x i8> %a to <4 x i32>			ret <4 x i8> %a
	ret <4 x i32> %conv
	}			}

	define i32 @loadExt.i32(<4 x i8>* %ref) {			define <2 x i16> @loadExt.v2i16(<2 x i16>* %ref) {
	; CHECK-LABEL: loadExt.i32:			; CHECK-LABEL: loadExt.v2i16:
	; CHECK: ldrb			; CHECK: ld1 { [[REG:v[0-9]+]].s }[0], [x0]
	%a = load <4 x i8>, <4 x i8>* %ref			; CHECK: zip1 {{v[0-9]+}}.4h, [[REG]].4h, {{v[0-9]+}}.4h
	%vecext = extractelement <4 x i8> %a, i32 0			%a = load <2 x i16>, <2 x i16>* %ref
	%conv = zext i8 %vecext to i32			ret <2 x i16> %a
	ret i32 %conv
	}			}