Diff 119339

lib/Target/ARM/ARMISelLowering.cpp

	Show First 20 Lines • Show All 92 Lines • ▼ Show 20 Lines

	// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))			// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
	if (N0.getNode()->hasOneUse())			if (N0.getNode()->hasOneUse())
	if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))			if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
	return Result;			return Result;
	return SDValue();			return SDValue();
	}			}

				/// PerformUnfoldSHL - Try to rewrite (op (shl x, c2), c1 << c2), where op is
				/// ISD::ADD or ISD::OR, as (shl (op x, c1), c2).
				///
				/// \param N   the ADD/OR node to inspect; operand 0 must be the SHL and
				///            operand 1 the constant.
				/// \param DCI combiner state; DCI.DAG is used to build the new nodes.
				/// \param ST  subtarget; Thumb1-only targets are rejected.
				/// \returns an empty SDValue when the pattern does not apply. On success all
				///          uses of N are replaced with the new SHL and SDValue(N, 0) is
				///          returned.
				static SDValue PerformUnfoldSHL(SDNode *N,
				john.brawnUnsubmitted Not Done Reply Inline Actions 'Unfold' is probably the wrong description of this as we may get these kind of instruction sequences without any folding having happened e.g. int fn(int a, int b) { return b + ((a << 1) \| 510); } john.brawn: 'Unfold' is probably the wrong description of this as we may get these kind of instruction…
				TargetLowering::DAGCombinerInfo &DCI,
				const ARMSubtarget *ST) {
				// Thumb1-only targets are skipped; per the review note below, 16-bit
				// Thumb instructions have no shifted-operand form.
				if (ST->isThumb() && ST->isThumb1Only())
				return SDValue();
				john.brawnUnsubmitted Not Done Reply Inline Actions Could do with a comment here saying something like "no 16-bit thumb instructions with shifted operand". john.brawn: Could do with a comment here saying something like "no 16-bit thumb instructions with shifted…

				// Only ADD and OR roots are handled.
				if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR)
				return SDValue();
				john.brawnUnsubmitted Not Done Reply Inline Actions We should be doing this for XOR and AND as well. john.brawn: We should be doing this for XOR and AND as well.

				// The shift must be operand 0; the commuted form is not matched here.
				if (N->getOperand(0).getOpcode() != ISD::SHL)
				return SDValue();

				SDValue SHL = N->getOperand(0);

				// Both the binop's right operand (c1 << c2) and the shift amount (c2)
				// must be constants.
				auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
				auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
				if (!C1ShlC2 \|\| !C2)
				return SDValue();

				DEBUG(dbgs() << "Try to unfold: "; N->dump());
				// DAG combiner will fold:
				// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
				// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)

				// Many instructions can perform the shift for free, but that requires both
				// operands to be registers. If c1 << c2 is too large, a mov immediate
				// instruction will be needed. So, unfold back to the original pattern if:
				// - c1 and c2 are small enough that they don't require mov imms.
				// - the user(s) of the node can perform an shl

				APInt C2Int = C2->getAPIntValue();
				APInt C1Int = C1ShlC2->getAPIntValue();

				// Check that C1 could have been shl'd by C2: the mask keeps every bit
				// except the low c2 bits, so any set bit below position c2 rejects.
				APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
				C2Int.getBitWidth() - C2->getZExtValue());
				if ((C1Int & Mask) != C1Int)
				return SDValue();

				// Undo c1 << c2
				C1Int.lshrInPlace(C2Int);

				// The immediates are encoded as an 8-bit value that can be rotated.
				// BitWidth - Zeros is the distance spanned from the lowest to the highest
				// set bit; reject when that span is wider than 8 bits.
				// NOTE(review): for a zero value both counts presumably equal the bit
				// width, so the unsigned subtraction would wrap and reject it - confirm.
				unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
				if (C1Int.getBitWidth() - Zeros > 8)
				return SDValue();

				// Same span test applied to the shift amount.
				Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
				if (C2Int.getBitWidth() - Zeros > 8)
				return SDValue();

				// Check that all the users could perform the shl themselves.
				// BinOp only holds a placeholder here; it is reassigned once the new
				// node is built further down.
				SDValue BinOp = SDValue(N, 0);
				for (auto U : N->uses()) {
				switch(U->getOpcode()) {
				default:
				// Any user outside the list below blocks the transform.
				return SDValue();
				case ISD::ADD:
				case ISD::SUB:
				case ISD::AND:
				case ISD::OR:
				case ISD::XOR:
				case ISD::SETCC:
				case ARMISD::CMP:
				john.brawnUnsubmitted Not Done Reply Inline Actions You're setting BinOp here then immediately setting it to something else a few lines later, you should be setting it just once. john.brawn: You're setting BinOp here then immediately setting it to something else a few lines later, you…
				// Leaves the switch only; the loop moves on to the next user.
				break;
				}
				}

				// Build (shl (op x, c1), c2). Both new nodes are created as MVT::i32.
				SelectionDAG &DAG = DCI.DAG;
				SDLoc dl(N);
				SDValue X = SHL.getOperand(0);
				BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
				DAG.getConstant(C1Int, dl, MVT::i32));
				SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

				DEBUG(dbgs() << "Unfolding: "; N->dump());
				DEBUG(dbgs() << "Into shift operation:\n";
				BinOp.dump();
				john.brawnUnsubmitted Not Done Reply Inline Actions It would make more sense to do this check nearer to the start of the function. Also you could check that all uses don't already have a shifted operand, as in that case doing this transform doesn't have any benefit. That should mean you don't have to adjust the load-combine tests. john.brawn: It would make more sense to do this check nearer to the start of the function. Also you could…
				Res.dump());

				// Redirect every user of N to the new shl, then hand N itself back.
				DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
				return SDValue(N, 0);
				}

	/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.			/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
	///			///
	static SDValue PerformADDCombine(SDNode *N,			static SDValue PerformADDCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,			TargetLowering::DAGCombinerInfo &DCI,
	const ARMSubtarget *Subtarget) {			const ARMSubtarget *Subtarget) {
	SDValue N0 = N->getOperand(0);			SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);			SDValue N1 = N->getOperand(1);

				// Try the shl unfold before the generic ADD combines. It is attempted
				// once, without commuting: PerformUnfoldSHL expects the shl in operand 0
				// and the immediate in operand 1.
				if (SDValue Result = PerformUnfoldSHL(N, DCI, Subtarget))
				return Result;

	// First try with the default operand order.			// First try with the default operand order.
	if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))			if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
	return Result;			return Result;

	// If that didn't work, try again with the operands commuted.			// If that didn't work, try again with the operands commuted.
	return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);			return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
	}			}

	▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
	return DAG.getNode(Opcode, DL, VT,			return DAG.getNode(Opcode, DL, VT,
	DAG.getNode(ISD::MUL, DL, VT, N00, N1),			DAG.getNode(ISD::MUL, DL, VT, N00, N1),
	DAG.getNode(ISD::MUL, DL, VT, N01, N1));			DAG.getNode(ISD::MUL, DL, VT, N01, N1));
	}			}

	static SDValue PerformMULCombine(SDNode *N,			static SDValue PerformMULCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,			TargetLowering::DAGCombinerInfo &DCI,
	const ARMSubtarget *Subtarget) {			const ARMSubtarget *Subtarget) {
	SelectionDAG &DAG = DCI.DAG;			SelectionDAG &DAG = DCI.DAG;
				john.brawnUnsubmitted Not Done Reply Inline Actions Moving this comment looks a little odd. john.brawn: Moving this comment looks a little odd.

	if (Subtarget->isThumb1Only())			if (Subtarget->isThumb1Only())
	return SDValue();			return SDValue();

	if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())			if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())
	return SDValue();			return SDValue();

	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);
	Show All 10 Lines
	unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);			unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

	ShiftAmt = ShiftAmt & (32 - 1);			ShiftAmt = ShiftAmt & (32 - 1);
	SDValue V = N->getOperand(0);			SDValue V = N->getOperand(0);
	SDLoc DL(N);			SDLoc DL(N);

	SDValue Res;			SDValue Res;
	MulAmt >>= ShiftAmt;			MulAmt >>= ShiftAmt;

				john.brawnUnsubmitted Not Done Reply Inline Actions This should be called something else as there's already a PerformBFICombine and the name here violates the rule that the argument to PerformXCombine is a node of type X (here it's an OR of an AND). Probably PerformORCombineToBFI. john.brawn: This should be called something else as there's already a PerformBFICombine and the name here…
	▲ Show 20 Lines • Show All 92 Lines • ▼ Show 20 Lines
	if (!Subtarget->isThumb1Only()) {			if (!Subtarget->isThumb1Only()) {
	// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))			// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
	if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))			if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
	return Result;			return Result;
	if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))			if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
	return Result;			return Result;
	}			}

	// The code below optimizes (or (and X, Y), Z).
	// The AND operand needs to have a single user to make these optimizations
	// profitable.
	SDValue N0 = N->getOperand(0);			SDValue N0 = N->getOperand(0);
	if (N0.getOpcode() != ISD::AND \|\| !N0.hasOneUse())
	return SDValue();
	SDValue N1 = N->getOperand(1);			SDValue N1 = N->getOperand(1);

	// (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.			// (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
	if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&			if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
	DAG.getTargetLoweringInfo().isTypeLegal(VT)) {			DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

				// The code below optimizes (or (and X, Y), Z).
				// The AND operand needs to have a single user to make these optimizations
				// profitable.
				if (N0.getOpcode() != ISD::AND \|\| !N0.hasOneUse())
				return SDValue();

	APInt SplatUndef;			APInt SplatUndef;
	unsigned SplatBitSize;			unsigned SplatBitSize;
	bool HasAnyUndefs;			bool HasAnyUndefs;

	APInt SplatBits0, SplatBits1;			APInt SplatBits0, SplatBits1;
	BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));			BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
	BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));			BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
	// Ensure that the second operand of both ands are constants			// Ensure that the second operand of both ands are constants
	Show All 21 Lines

	// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when			// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
	// reasonable.			// reasonable.

	// BFI is only available on V6T2+			// BFI is only available on V6T2+
	if (Subtarget->isThumb1Only() \|\| !Subtarget->hasV6T2Ops())			if (Subtarget->isThumb1Only() \|\| !Subtarget->hasV6T2Ops())
	return SDValue();			return SDValue();

	if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))			if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
	return Res;			if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
				return Res;
				}

				if (SDValue Result = PerformUnfoldSHL(N, DCI, Subtarget))
				return Result;

	return SDValue();			return SDValue();
	}			}

	static SDValue PerformXORCombine(SDNode *N,			static SDValue PerformXORCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,			TargetLowering::DAGCombinerInfo &DCI,
	const ARMSubtarget *Subtarget) {			const ARMSubtarget *Subtarget) {
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);
	▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines

test/CodeGen/ARM/unfold-shifts.ll

This file was added.

				; RUN: llc -mtriple armv6t2 %s -o - \| FileCheck %s
				; RUN: llc -mtriple thumbv6t2 %s -o - \| FileCheck %s --check-prefix=CHECK-T2
				; RUN: llc -mtriple armv7 %s -o - \| FileCheck %s
				; RUN: llc -mtriple thumbv7 %s -o - \| FileCheck %s --check-prefix=CHECK-T2
				; RUN: llc -mtriple thumbv7m %s -o - \| FileCheck %s --check-prefix=CHECK-T2
				; RUN: llc -mtriple thumbv8m.main %s -o - \| FileCheck %s --check-prefix=CHECK-T2

				; b + ((a << 1) | 510): the or-immediate is expected on the unshifted
				; value, with the shift folded into the add as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold1
				; CHECK-NOT: mov
				; CHECK: orr r0, r0, #255
				; CHECK: add r0, r1, r0, lsl #1
				; CHECK-T2-LABEL: unfold1
				; CHECK-T2-NOT: mov
				; CHECK-T2: orr r0, r0, #255
				; CHECK-T2: add.w r0, r1, r0, lsl #1
				define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 1
				%or = or i32 %shl, 510
				%add = add nsw i32 %or, %b
				ret i32 %add
				}

				; b - ((a << 2) | 16320): the or-immediate is expected on the unshifted
				; value, with the shift folded into the sub as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold2
				; CHECK-NOT: mov
				; CHECK: orr r0, r0, #4080
				; CHECK: sub r0, r1, r0, lsl #2
				; CHECK-T2-LABEL: unfold2
				; CHECK-T2-NOT: mov
				; CHECK-T2: orr r0, r0, #4080
				; CHECK-T2: sub.w r0, r1, r0, lsl #2
				define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 2
				%or = or i32 %shl, 16320
				%sub = sub nsw i32 %b, %or
				ret i32 %sub
				}

				; ((a << 4) | 1044480) & b: the or-immediate is expected on the unshifted
				; value, with the shift folded into the and as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold3
				; CHECK-NOT: mov
				; CHECK: orr r0, r0, #65280
				; CHECK: and r0, r1, r0, lsl #4
				; CHECK-T2-LABEL: unfold3
				; CHECK-T2-NOT: mov
				; CHECK-T2: orr r0, r0, #65280
				; CHECK-T2: and.w r0, r1, r0, lsl #4
				define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 4
				%or = or i32 %shl, 1044480
				%and = and i32 %or, %b
				ret i32 %and
				}

				; ((a << 5) | 33423360) ^ b: the or-immediate is expected on the unshifted
				; value, with the shift folded into the eor as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold4
				; CHECK-NOT: mov
				; CHECK: orr r0, r0, #1044480
				; CHECK: eor r0, r1, r0, lsl #5
				; CHECK-T2-LABEL: unfold4
				; CHECK-T2-NOT: mov
				; CHECK-T2: orr r0, r0, #1044480
				; CHECK-T2: eor.w r0, r1, r0, lsl #5
				define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 5
				%or = or i32 %shl, 33423360
				%xor = xor i32 %or, %b
				ret i32 %xor
				}

				; ((a << 6) + 31744) | b: the add-immediate is expected on the unshifted
				; value, with the shift folded into the orr as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function. The Thumb2 no-mov check is
				; added to match the sibling tests in this file.
				; CHECK-LABEL: unfold5
				; CHECK-NOT: mov
				; CHECK: add r0, r0, #496
				; CHECK: orr r0, r1, r0, lsl #6
				; CHECK-T2-LABEL: unfold5
				; CHECK-T2-NOT: mov
				; CHECK-T2: add.w r0, r0, #496
				; CHECK-T2: orr.w r0, r1, r0, lsl #6
				define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 6
				%add = add i32 %shl, 31744
				%or = or i32 %add, %b
				ret i32 %or
				}

				; ((a << 8) + 2031616) & b: the add-immediate is expected on the unshifted
				; value, with the shift folded into the and as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold6
				; CHECK-NOT: mov
				; CHECK: add r0, r0, #7936
				; CHECK: and r0, r1, r0, lsl #8
				; CHECK-T2-LABEL: unfold6
				; CHECK-T2-NOT: mov
				; CHECK-T2: add.w r0, r0, #7936
				; CHECK-T2: and.w r0, r1, r0, lsl #8
				define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 8
				%add = add i32 %shl, 2031616
				%and = and i32 %add, %b
				ret i32 %and
				}

				; ((a << 9) + 65011712) ^ b: the add-immediate is expected on the unshifted
				; value, with the shift folded into the eor as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold7
				; CHECK-NOT: mov
				; CHECK: add r0, r0, #126976
				; CHECK: eor r0, r1, r0, lsl #9
				; CHECK-T2-LABEL: unfold7
				; CHECK-T2-NOT: mov
				; CHECK-T2: add.w r0, r0, #126976
				; CHECK-T2: eor.w r0, r1, r0, lsl #9
				define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 9
				%add = add i32 %shl, 65011712
				%xor = xor i32 %add, %b
				ret i32 %xor
				}

				; ((a << 10) | 4177920) > b: the or-immediate is expected on the unshifted
				; value, with the shift folded into the cmp as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold8
				; CHECK-NOT: mov r2
				; CHECK: orr r2, r0, #4080
				; CHECK: cmp r1, r2, lsl #10
				; CHECK-T2-LABEL: unfold8
				; CHECK-T2-NOT: mov.w r2
				; CHECK-T2: orr r2, r0, #4080
				; CHECK-T2: cmp.w r1, r2, lsl #10
				define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 10
				%or = or i32 %shl, 4177920
				%cmp = icmp sgt i32 %or, %b
				%conv = zext i1 %cmp to i32
				ret i32 %conv
				}

				; ((a << 11) + 16252928) > b: the add-immediate is expected on the
				; unshifted value, with the shift folded into the cmp as a shifted operand.
				; The Thumb2 RUN lines only enable the T2 prefix, so they need their own
				; label to anchor the checks to this function.
				; CHECK-LABEL: unfold9
				; CHECK-NOT: mov r2
				; CHECK: add r2, r0, #7936
				; CHECK: cmp r1, r2, lsl #11
				; CHECK-T2-LABEL: unfold9
				; CHECK-T2-NOT: mov.w r2
				; CHECK-T2: add.w r2, r0, #7936
				; CHECK-T2: cmp.w r1, r2, lsl #11
				define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
				entry:
				%shl = shl i32 %a, 11
				%add = add i32 %shl, 16252928
				%cmp = icmp sgt i32 %add, %b
				%conv = zext i1 %cmp to i32
				ret i32 %conv
				}

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] add, or, and and xor with shl combining
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 119339

lib/Target/ARM/ARMISelLowering.cpp

test/CodeGen/ARM/unfold-shifts.ll

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] add, or, and and xor with shl combining (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 119339

lib/Target/ARM/ARMISelLowering.cpp

test/CodeGen/ARM/unfold-shifts.ll

[ARM] add, or, and and xor with shl combining
ClosedPublic