This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Disable shouldFoldConstantShiftPairToMask for btver1/btver2 targets (PR40758)
ClosedPublic

Authored by RKSimon on Apr 24 2019, 7:18 AM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel
andreadb

Commits

rG5d6ef94c369a: [X86][SSE] Disable shouldFoldConstantShiftPairToMask for btver1/btver2 targets…
rL359293: [X86][SSE] Disable shouldFoldConstantShiftPairToMask for btver1/btver2 targets…

Summary

As detailed in PR40758, Bobcat/Jaguar can perform vector immediate shifts on the same pipes as vector ANDs, and with the same latency, so it doesn't make sense to replace a shl+lshr pair with a shift+and pair: the shift+and form requires an additional mask constant (with the extra constant-pool, load and register-pressure costs).

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. Apr 24 2019, 7:18 AM

Herald added a project: Restricted Project. · View Herald Transcript · Apr 24 2019, 7:18 AM

spatel added inline comments. Apr 24 2019, 9:45 AM

lib/Target/X86/X86.td
427–430 ↗	(On Diff #196446)	Is there a possibility that we would use this for scalar transforms too? If not, better to make this explicitly about vectors: "FeatureFastVectorShifts" ?

RKSimon marked an inline comment as done. Apr 24 2019, 12:01 PM

RKSimon added inline comments.

lib/Target/X86/X86.td
427–430 ↗	(On Diff #196446)	Yes, AMD targets at least should benefit from the scalar case as well - I'll investigate.

LGTM

This revision is now accepted and ready to land. Apr 25 2019, 12:36 PM

Closed by commit rL359293: [X86][SSE] Disable shouldFoldConstantShiftPairToMask for btver1/btver2 targets… (authored by RKSimon). · Explain Why · Apr 26 2019, 3:47 AM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in rL360530: [X86] Add scalar shl+lshr -> shift+mask tests (PR40758). May 11 2019, 12:14 PM

RKSimon mentioned this in rG91e697c145a0: [X86] Add scalar shl+lshr -> shift+mask tests (PR40758). May 11 2019, 12:17 PM

RKSimon mentioned this in D61830: [X86][SSE] Disable shouldFoldConstantShiftPairToMask for scalar shifts on AMD targets (PR40758). May 11 2019, 1:18 PM

RKSimon mentioned this in rL360684: [X86] Disable shouldFoldConstantShiftPairToMask for scalar shifts on AMD…. May 14 2019, 8:19 AM

RKSimon mentioned this in rGc2d9cfd9250d: [X86] Disable shouldFoldConstantShiftPairToMask for scalar shifts on AMD…. May 14 2019, 8:22 AM

sidorovd mentioned this in rGaaba479b6097: [X86] Add scalar shl+lshr -> shift+mask tests (PR40758). May 30 2019, 9:11 AM

sidorovd mentioned this in rGa915fd49484e: [X86] Disable shouldFoldConstantShiftPairToMask for scalar shifts on AMD…. May 30 2019, 9:19 AM

sidorovd mentioned this in rG2a83c71ee6dd: [X86] Add scalar shl+lshr -> shift+mask tests (PR40758). May 30 2019, 10:11 AM

sidorovd mentioned this in rG3e131336d3a7: [X86] Disable shouldFoldConstantShiftPairToMask for scalar shifts on AMD…. May 30 2019, 10:20 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

3 lines

Target/

X86/

X86.td

8 lines

X86ISelLowering.cpp

13 lines

X86Subtarget.h

4 lines

test/

CodeGen/

X86/

sse2-vector-shifts.ll

20 lines

Diff 196830

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,876 Lines • ▼ Show 20 Lines	if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
DAG.getConstant(C1 - C2, DL, N1.getValueType()));		DAG.getConstant(C1 - C2, DL, N1.getValueType()));
}		}
}		}

// fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or		// fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
// (and (srl x, (sub c1, c2), MASK)		// (and (srl x, (sub c1, c2), MASK)
// Only fold this if the inner shift has no other uses -- if it does, folding		// Only fold this if the inner shift has no other uses -- if it does, folding
// this will increase the total number of instructions.		// this will increase the total number of instructions.
		// TODO - drop hasOneUse requirement if c1 == c2?
		// TODO - support non-uniform vector shift amounts.
if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&		if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
TLI.shouldFoldConstantShiftPairToMask(N, Level)) {		TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {		if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
if (N0C1->getAPIntValue().ult(OpSizeInBits)) {		if (N0C1->getAPIntValue().ult(OpSizeInBits)) {
uint64_t c1 = N0C1->getZExtValue();		uint64_t c1 = N0C1->getZExtValue();
uint64_t c2 = N1C->getZExtValue();		uint64_t c2 = N1C->getZExtValue();
APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);		APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
SDValue Shift;		SDValue Shift;
▲ Show 20 Lines • Show All 290 Lines • ▼ Show 20 Lines	if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) {
N0.getOperand(0).getOperand(0),		N0.getOperand(0).getOperand(0),
DAG.getConstant(c1 + c2, DL,		DAG.getConstant(c1 + c2, DL,
ShiftCountVT)));		ShiftCountVT)));
}		}
}		}
}		}

// fold (srl (shl x, c), c) -> (and x, cst2)		// fold (srl (shl x, c), c) -> (and x, cst2)
		// TODO - (srl (shl x, c1), c2).
if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&		if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
isConstantOrConstantVector(N1, /* NoOpaques */ true)) {		isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
SDLoc DL(N);		SDLoc DL(N);
SDValue Mask =		SDValue Mask =
DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);		DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
AddToWorklist(Mask.getNode());		AddToWorklist(Mask.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);		return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
}		}
▲ Show 20 Lines • Show All 12,890 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86.td

Show First 20 Lines • Show All 418 Lines • ▼ Show 20 Lines
// instructions if a CPU implements horizontal operations (introduced with		// instructions if a CPU implements horizontal operations (introduced with
// SSE3) with better latency/throughput than the alternative sequence.		// SSE3) with better latency/throughput than the alternative sequence.
def FeatureFastHorizontalOps		def FeatureFastHorizontalOps
: SubtargetFeature<		: SubtargetFeature<
"fast-hops", "HasFastHorizontalOps", "true",		"fast-hops", "HasFastHorizontalOps", "true",
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "		"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles", [FeatureSSE3]>;		"normal vector instructions with shuffles", [FeatureSSE3]>;

		def FeatureFastVectorShiftMasks
		: SubtargetFeature<
		"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
		"Prefer a left/right vector logical shift pair over a shift+and pair">;

// Merge branches using three-way conditional code.		// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",		def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",		"ThreewayBranchProfitable", "true",
"Merge branches to a three-way "		"Merge branches to a three-way "
"conditional branch">;		"conditional branch">;

// Bonnell		// Bonnell
def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;		def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
▲ Show 20 Lines • Show All 335 Lines • ▼ Show 20 Lines	list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
FeatureNOPL,		FeatureNOPL,
Feature64Bit,		Feature64Bit,
FeatureCMPXCHG16B,		FeatureCMPXCHG16B,
FeaturePRFCHW,		FeaturePRFCHW,
FeatureLZCNT,		FeatureLZCNT,
FeaturePOPCNT,		FeaturePOPCNT,
FeatureSlowSHLD,		FeatureSlowSHLD,
FeatureLAHFSAHF,		FeatureLAHFSAHF,
FeatureFast15ByteNOP];		FeatureFast15ByteNOP,
		FeatureFastVectorShiftMasks];
list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;		list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;

// Jaguar		// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,		list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
FeatureAES,		FeatureAES,
FeaturePCLMUL,		FeaturePCLMUL,
FeatureBMI,		FeatureBMI,
FeatureF16C,		FeatureF16C,
▲ Show 20 Lines • Show All 431 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,007 Lines • ▼ Show 20 Lines	bool X86TargetLowering::hasAndNot(SDValue Y) const {
if (VT == MVT::v4i32)		if (VT == MVT::v4i32)
return true;		return true;

return Subtarget.hasSSE2();		return Subtarget.hasSSE2();
}		}

bool X86TargetLowering::shouldFoldConstantShiftPairToMask(		bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {		const SDNode *N, CombineLevel Level) const {
// TODO - some targets prefer immediate vector shifts to shift+mask.		assert((N->getOpcode() == ISD::SHL &&
		N->getOperand(0).getOpcode() == ISD::SRL) ||
		(N->getOpcode() == ISD::SRL &&
		N->getOperand(0).getOpcode() == ISD::SHL) &&
		"Expected shift-shift mask");

		if (Subtarget.hasFastVectorShiftMasks() && N->getValueType(0).isVector()) {
		// Only fold if the shift values are equal - so it folds to AND.
		// TODO - we should fold if either is non-uniform but we don't do the
		// fold for non-splats yet.
		return N->getOperand(1) == N->getOperand(0).getOperand(1);
		}
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);		return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}		}

bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {		bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();		EVT VT = Y.getValueType();

// For vectors, we don't have a preference, but we probably want a mask.		// For vectors, we don't have a preference, but we probably want a mask.
if (VT.isVector())		if (VT.isVector())
▲ Show 20 Lines • Show All 39,031 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86Subtarget.h

Show First 20 Lines • Show All 387 Lines • ▼ Show 20 Lines	protected:
bool HasPCONFIG = false;		bool HasPCONFIG = false;

/// Processor has a single uop BEXTR implementation.		/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;		bool HasFastBEXTR = false;

/// Try harder to combine to horizontal vector ops if they are fast.		/// Try harder to combine to horizontal vector ops if they are fast.
bool HasFastHorizontalOps = false;		bool HasFastHorizontalOps = false;

		/// Prefer a left/right vector logical shift pair over a shift+and pair.
		bool HasFastVectorShiftMasks = false;

/// Use a retpoline thunk rather than indirect calls to block speculative		/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.		/// execution.
bool UseRetpolineIndirectCalls = false;		bool UseRetpolineIndirectCalls = false;

/// Use a retpoline thunk or remove any indirect branch to block speculative		/// Use a retpoline thunk or remove any indirect branch to block speculative
/// execution.		/// execution.
bool UseRetpolineIndirectBranches = false;		bool UseRetpolineIndirectBranches = false;

▲ Show 20 Lines • Show All 235 Lines • ▼ Show 20 Lines	public:
}		}
bool hasFastGather() const { return HasFastGather; }		bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }		bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }		bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }		bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }		bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastBEXTR() const { return HasFastBEXTR; }		bool hasFastBEXTR() const { return HasFastBEXTR; }
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }		bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
		bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
bool hasMacroFusion() const { return HasMacroFusion; }		bool hasMacroFusion() const { return HasMacroFusion; }
bool hasBranchFusion() const { return HasBranchFusion; }		bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }		bool hasERMSB() const { return HasERMSB; }
bool hasSlowDivide32() const { return HasSlowDivide32; }		bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }		bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }		bool padShortFunctions() const { return PadShortFunctions; }
bool slowTwoMemOps() const { return SlowTwoMemOps; }		bool slowTwoMemOps() const { return SlowTwoMemOps; }
bool LEAusesAG() const { return LEAUsesAG; }		bool LEAusesAG() const { return LEAUsesAG; }
▲ Show 20 Lines • Show All 204 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s			; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,MASK
				; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,+fast-vector-shift-masks | FileCheck %s --check-prefixes=CHECK,SHIFT
				; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SHIFT

	; SSE2 Logical Shift Left			; SSE2 Logical Shift Left

	define <8 x i16> @test_sllw_1(<8 x i16> %InVec) {			define <8 x i16> @test_sllw_1(<8 x i16> %InVec) {
	; CHECK-LABEL: test_sllw_1:			; CHECK-LABEL: test_sllw_1:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	▲ Show 20 Lines • Show All 284 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: andps {{.*}}(%rip), %xmm0			; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%shl0 = ashr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>			%shl0 = ashr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
	%shl1 = shl <4 x i32> %shl0, <i32 4, i32 4, i32 4, i32 4>			%shl1 = shl <4 x i32> %shl0, <i32 4, i32 4, i32 4, i32 4>
	ret <4 x i32> %shl1			ret <4 x i32> %shl1
	}			}

	define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {			define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
	; CHECK-LABEL: shl_srl_v4i32:			; MASK-LABEL: shl_srl_v4i32:
	; CHECK: # %bb.0:			; MASK: # %bb.0:
	; CHECK-NEXT: pslld $3, %xmm0			; MASK-NEXT: pslld $3, %xmm0
	; CHECK-NEXT: pand {{.*}}(%rip), %xmm0			; MASK-NEXT: pand {{.*}}(%rip), %xmm0
	; CHECK-NEXT: retq			; MASK-NEXT: retq
				;
				; SHIFT-LABEL: shl_srl_v4i32:
				; SHIFT: # %bb.0:
				; SHIFT-NEXT: psrld $2, %xmm0
				; SHIFT-NEXT: pslld $5, %xmm0
				; SHIFT-NEXT: retq
	%shl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>			%shl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
	%shl1 = shl <4 x i32> %shl0, <i32 5, i32 5, i32 5, i32 5>			%shl1 = shl <4 x i32> %shl0, <i32 5, i32 5, i32 5, i32 5>
	ret <4 x i32> %shl1			ret <4 x i32> %shl1
	}			}

	define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {			define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
	; CHECK-LABEL: shl_zext_srl_v4i32:			; CHECK-LABEL: shl_zext_srl_v4i32:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	▲ Show 20 Lines • Show All 57 Lines • Show Last 20 Lines