Diff 202285

lib/Transforms/Vectorize/SLPVectorizer.cpp

Show First 20 Lines • Show All 911 Lines • ▼ Show 20 Lines	class VLOperands {
}		}

/// \returns true if the data structure is empty.		/// \returns true if the data structure is empty.
bool empty() const { return OpsVec.empty(); }		bool empty() const { return OpsVec.empty(); }

/// Clears the data.		/// Clears the data.
void clear() { OpsVec.clear(); }		void clear() { OpsVec.clear(); }

		/// \Returns true if there are enough operands identical to \p Op to fill
		/// the whole vector.
		/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
		bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
		ABataev [Not Done]: The function can be made `const`, I think.
		vporpo (author) [Done]: It is modifying the 'IsUsed' so that we skip the operands that have already been identified as a broadcast, so it needs to call the non-const getData(), therefore it cannot be const.
		bool OpAPO = getData(OpIdx, Lane).APO;
		ABataev [Done]: `auto`->`bool`
		for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
		if (Ln == Lane)
		continue;
		// This is set to true if we found a candidate for broadcast at Lane.
		bool FoundCandidate = false;
		for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
		OperandData &Data = getData(OpI, Ln);
		ABataev [Done]: `auto &`->`const OperandData &`
		if (Data.APO != OpAPO || Data.IsUsed)
		ABataev [Not Done]: What if the `Data.V` is `Undef`? I think, we can use broadcast in this case too.
		vporpo (author) [Done]: Hmm, currently there is no restriction on `Undef`s, as `Data.V` can be any value. But maybe undefs should be matched with a lower priority?
		continue;
		if (Data.V == Op) {
		FoundCandidate = true;
		Data.IsUsed = true;
		break;
		}
		}
		if (!FoundCandidate)
		return false;
		}
		return true;
		}

public:		public:
/// Initialize with all the operands of the instruction vector \p RootVL.		/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,		VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
ScalarEvolution &SE)		ScalarEvolution &SE)
: DL(DL), SE(SE) {		: DL(DL), SE(SE) {
// Append all the operands of RootVL.		// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);		appendOperandsOfVL(RootVL);
}		}
Show All 38 Lines	void reorder() {

// Initialize the modes.		// Initialize the modes.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {		for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
Value *OpLane0 = getValue(OpIdx, FirstLane);		Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one		// Keep track if we have instructions with all the same opcode on one
// side.		// side.
if (isa<LoadInst>(OpLane0))		if (isa<LoadInst>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Load;		ReorderingModes[OpIdx] = ReorderingMode::Load;
else if (isa<Instruction>(OpLane0))		else if (isa<Instruction>(OpLane0)) {
		// Check if OpLane0 should be broadcast.
		if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
		ReorderingModes[OpIdx] = ReorderingMode::Splat;
		else
ReorderingModes[OpIdx] = ReorderingMode::Opcode;		ReorderingModes[OpIdx] = ReorderingMode::Opcode;
		}
else if (isa<Constant>(OpLane0))		else if (isa<Constant>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Constant;		ReorderingModes[OpIdx] = ReorderingMode::Constant;
else if (isa<Argument>(OpLane0))		else if (isa<Argument>(OpLane0))
// Our best hope is a Splat. It may save some cost in some cases.		// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;		ReorderingModes[OpIdx] = ReorderingMode::Splat;
else		else
// NOTE: This should be unreachable.		// NOTE: This should be unreachable.
ReorderingModes[OpIdx] = ReorderingMode::Failed;		ReorderingModes[OpIdx] = ReorderingMode::Failed;
}		}

// If the initial strategy fails for any of the operand indexes, then we		// If the initial strategy fails for any of the operand indexes, then we
// perform reordering again in a second pass. This helps avoid assigning		// perform reordering again in a second pass. This helps avoid assigning
// high priority to the failed strategy, and should improve reordering for		// high priority to the failed strategy, and should improve reordering for
// the non-failed operand indexes.		// the non-failed operand indexes.
for (int Pass = 0; Pass != 2; ++Pass) {		for (int Pass = 0; Pass != 2; ++Pass) {
// Skip the second pass if the first pass did not fail.		// Skip the second pass if the first pass did not fail.
bool StrategyFailed = false;		bool StrategyFailed = false;
// Mark the operand data as free to use for all but the first pass.		// Mark all operand data as free to use.
if (Pass > 0)
clearUsed();		clearUsed();
// We keep the original operand order for the FirstLane, so reorder the		// We keep the original operand order for the FirstLane, so reorder the
// rest of the lanes. We are visiting the nodes in a circular fashion,		// rest of the lanes. We are visiting the nodes in a circular fashion,
// using FirstLane as the center point and increasing the radius		// using FirstLane as the center point and increasing the radius
// distance.		// distance.
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {		for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.		// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {		for (int Direction : {+1, -1}) {
int Lane = FirstLane + Direction * Distance;		int Lane = FirstLane + Direction * Distance;
▲ Show 20 Lines • Show All 5,852 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/broadcast.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-threshold=-999 < %s | FileCheck %s		; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-threshold=-999 < %s | FileCheck %s


; S[0] = %v1 + %v2		; S[0] = %v1 + %v2
; S[1] = %v2 + %v1		; S[1] = %v2 + %v1
; S[2] = %v2 + %v1		; S[2] = %v2 + %v1
; S[3] = %v1 + %v2		; S[3] = %v1 + %v2
;		;
; TODO: We should broadcast %v1 and %v2		; We broadcast %v1 and %v2
;		;

define void @bcast_vals(i64 *%A, i64 *%B, i64 *%S) {		define void @bcast_vals(i64 *%A, i64 *%B, i64 *%S) {
; CHECK-LABEL: @bcast_vals(		; CHECK-LABEL: @bcast_vals(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[A0:%.*]] = load i64, i64* [[A:%.*]], align 8		; CHECK-NEXT: [[A0:%.*]] = load i64, i64* [[A:%.*]], align 8
; CHECK-NEXT: [[B0:%.*]] = load i64, i64* [[B:%.*]], align 8		; CHECK-NEXT: [[B0:%.*]] = load i64, i64* [[B:%.*]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[A0]], i32 0		; CHECK-NEXT: [[V1:%.*]] = sub i64 [[A0]], 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B0]], i32 1		; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[TMP1]], <i64 1, i64 1>		; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> undef, i64 [[V1]], i32 0
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>		; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[V1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 1		; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[V1]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 0		; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[V1]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 0		; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> undef, i64 [[V2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP5]], i32 1		; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[V2]], i32 1
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>		; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[V2]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]]		; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[V2]], i32 3
		; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[TMP3]], [[TMP7]]
		ABataev [Not Done]: Hmm, it is very strange that the new result is more cost effective than the previous one. We change 2 insertelements, 1 vector sub and, actually, one shuffle (2 currently), to 2 scalar subs and 8 insertelements (2 broadcasts). Is this really so cost effective?
		vporpo (author) [Done]: I am not sure what you mean. The cost of 2 insert elements + one 2-wide sub should be the same as the 2 scalar subs, so nothing to gain from the old code so far. Next, in the old code we have one shuffle for the left input to the 4-wide add and another shuffle for the right input. This is more expensive than the 2 broadcasts we have in the new code, because broadcasts are cheaper than shuffles.
		ABataev [Done]: Ah, yes, I see, you're right.
; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0		; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0
; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1		; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1
; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2		; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2
; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3		; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>*		; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>*
; CHECK-NEXT: store <4 x i64> [[TMP7]], <4 x i64>* [[TMP8]], align 8		; CHECK-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* [[TMP9]], align 8
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
entry:		entry:
%A0 = load i64, i64 *%A, align 8		%A0 = load i64, i64 *%A, align 8
%B0 = load i64, i64 *%B, align 8		%B0 = load i64, i64 *%B, align 8

%v1 = sub i64 %A0, 1		%v1 = sub i64 %A0, 1
%v2 = sub i64 %B0, 1		%v2 = sub i64 %B0, 1
Show All 15 Lines	entry:
ret void		ret void
}		}

; S[0] = %v1 + %v2		; S[0] = %v1 + %v2
; S[1] = %v3 + %v1		; S[1] = %v3 + %v1
; S[2] = %v5 + %v1		; S[2] = %v5 + %v1
; S[3] = %v1 + %v4		; S[3] = %v1 + %v4
;		;
; TODO: We should broadcast %v1.		; We broadcast %v1.

;		;
define void @bcast_vals2(i16 *%A, i16 *%B, i16 *%C, i16 *%D, i16 *%E, i32 *%S) {		define void @bcast_vals2(i16 *%A, i16 *%B, i16 *%C, i16 *%D, i16 *%E, i32 *%S) {
; CHECK-LABEL: @bcast_vals2(		; CHECK-LABEL: @bcast_vals2(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8		; CHECK-NEXT: [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8
; CHECK-NEXT: [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8		; CHECK-NEXT: [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8
; CHECK-NEXT: [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8		; CHECK-NEXT: [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8
; CHECK-NEXT: [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8		; CHECK-NEXT: [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8
; CHECK-NEXT: [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8		; CHECK-NEXT: [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8
; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32		; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32
; CHECK-NEXT: [[V2:%.*]] = sext i16 [[B0]] to i32		; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B0]], i32 0
; CHECK-NEXT: [[V3:%.*]] = sext i16 [[C0]] to i32		; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1
; CHECK-NEXT: [[V4:%.*]] = sext i16 [[D0]] to i32		; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2
; CHECK-NEXT: [[V5:%.*]] = sext i16 [[E0]] to i32		; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[V1]], i32 0		; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[V3]], i32 1		; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[V1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V5]], i32 2		; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[V1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[V1]], i32 3		; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[V1]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[V2]], i32 0		; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[V1]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[V1]], i32 1		; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[V1]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[V4]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0		; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0
; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1		; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1
; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2		; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2
; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3		; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>*		; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 8		; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 8
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
entry:		entry:
%A0 = load i16, i16 *%A, align 8		%A0 = load i16, i16 *%A, align 8
%B0 = load i16, i16 *%B, align 8		%B0 = load i16, i16 *%B, align 8
%C0 = load i16, i16 *%C, align 8		%C0 = load i16, i16 *%C, align 8
%D0 = load i16, i16 *%D, align 8		%D0 = load i16, i16 *%D, align 8
%E0 = load i16, i16 *%E, align 8		%E0 = load i16, i16 *%E, align 8
Show All 23 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Fix regression in broadcasts caused by operand reordering patch D59973.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 202285

lib/Transforms/Vectorize/SLPVectorizer.cpp

test/Transforms/SLPVectorizer/X86/broadcast.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Fix regression in broadcasts caused by operand reordering patch D59973.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 202285

lib/Transforms/Vectorize/SLPVectorizer.cpp

test/Transforms/SLPVectorizer/X86/broadcast.ll

[SLP] Fix regression in broadcasts caused by operand reordering patch D59973.
ClosedPublic