This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Don't vectorize loads of non-packed types (like i1, i2).
ClosedPublic

Authored by mzolotukhin on Sep 29 2015, 6:29 PM.

Download Raw Diff

Details

Reviewers

nadav
aschwaighofer
hfinkel

Commits

rGfc783e91e0c0: [SLP] Don't vectorize loads of non-packed types (like i1, i2).
rL248943: [SLP] Don't vectorize loads of non-packed types (like i1, i2).

Summary

Given an array of i2 elements, 4 consecutive scalar loads will be lowered to
i8-sized loads and thus will access 4 consecutive bytes in memory. If we
vectorize these loads into a single <4 x i2> load, it'll access only 1 byte in
memory. Hence, we should prohibit vectorization in such cases.

PS: Initial patch was proposed by Arnold.

Diff Detail

Repository: rL LLVM

Event Timeline

mzolotukhin updated this revision to Diff 36059. · Sep 29 2015, 6:29 PM

mzolotukhin retitled this revision to "[SLP] Don't vectorize loads of non-packed types (like i1, i2)."

mzolotukhin updated this object.

mzolotukhin added reviewers: aschwaighofer, nadav, hfinkel.

mzolotukhin added a subscriber: llvm-commits.

mzolotukhin updated this object. · Sep 29 2015, 6:31 PM

LGTM.

Thank you.

Closed by commit rL248943: [SLP] Don't vectorize loads of non-packed types (like i1, i2). (authored by mzolotukhin). · Explain Why · Sep 30 2015, 2:07 PM

This revision was automatically updated to reflect the committed changes.

What about SLP-vectorization of stores? I suspect that we have the same bug for stores.

Hi Nadav,

Currently we can't get into such a situation with stores, but that's just by luck: the minimal vector size is 128 bits, and the maximum number of accesses we can bundle together is 16. So the minimal element size that could be used in a vector store is i8, which doesn't have this problem.

Next, I'm going to expose the incorrect behavior on stores by replacing these hardcoded parameters with command-line (cl) options (see e.g. D13278), fix it, and make sure we honor these constraints when vectorizing phis (currently we don't).

Thanks,
Michael

mzolotukhin mentioned this in D14260: Optimize store of "bitcast" from vector to aggregate. · Jan 29 2016, 1:07 PM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

Vectorize/

SLPVectorizer.cpp

19 lines

test/

Transforms/

SLPVectorizer/

X86/

bad_types.ll

26 lines

Diff 36142

llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

Show First 20 Lines • Show All 1,152 Lines • ▼ Show 20 Lines	case Instruction::ExtractElement: {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");		DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {		} else {
BS.cancelScheduling(VL);		BS.cancelScheduling(VL);
}		}
newTreeEntry(VL, Reuse);		newTreeEntry(VL, Reuse);
return;		return;
}		}
case Instruction::Load: {		case Instruction::Load: {
		// Check that a vectorized load would load the same memory as a scalar
		// load.
		// For example we don't want vectorize loads that are smaller than 8 bit.
		// Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
		// loading/storing it as an i8 struct. If we vectorize loads/stores from
		// such a struct we read/write packed bits disagreeing with the
		// unvectorized version.
		const DataLayout &DL = F->getParent()->getDataLayout();
		Type *ScalarTy = VL[0]->getType();

		if (DL.getTypeSizeInBits(ScalarTy) !=
		DL.getTypeAllocSizeInBits(ScalarTy)) {
		BS.cancelScheduling(VL);
		newTreeEntry(VL, false);
		DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
		return;
		}
// Check if the loads are consecutive or of we need to swizzle them.		// Check if the loads are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {		for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);		LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {		if (!L->isSimple()) {
BS.cancelScheduling(VL);		BS.cancelScheduling(VL);
newTreeEntry(VL, false);		newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");		DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;		return;
}		}
const DataLayout &DL = F->getParent()->getDataLayout();
if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {		if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {		if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
++NumLoadsWantToChangeOrder;		++NumLoadsWantToChangeOrder;
}		}
BS.cancelScheduling(VL);		BS.cancelScheduling(VL);
newTreeEntry(VL, false);		newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");		DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
return;		return;
▲ Show 20 Lines • Show All 2,934 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/SLPVectorizer/X86/bad_types.ll

	Show First 20 Lines • Show All 41 Lines • ▼ Show 20 Lines

	exit:			exit:
	%a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ]			%a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ]
	%b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ]			%b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ]
	tail call void @f(i64 %a.phi, i64 %b.phi)			tail call void @f(i64 %a.phi, i64 %b.phi)
	ret void			ret void
	}			}

				define i8 @test3(i8 *%addr) {
				; Check that we do not vectorize types that are padded to a bigger ones.
				;
				; CHECK-LABEL: @test3
				; CHECK-NOT: <4 x i2>
				; CHECK: ret i8
				entry:
				%a = bitcast i8* %addr to i2*
				%a0 = getelementptr inbounds i2, i2* %a, i64 0
				%a1 = getelementptr inbounds i2, i2* %a, i64 1
				%a2 = getelementptr inbounds i2, i2* %a, i64 2
				%a3 = getelementptr inbounds i2, i2* %a, i64 3
				%l0 = load i2, i2* %a0, align 1
				%l1 = load i2, i2* %a1, align 1
				%l2 = load i2, i2* %a2, align 1
				%l3 = load i2, i2* %a3, align 1
				br label %bb1
				bb1: ; preds = %entry
				%p0 = phi i2 [ %l0, %entry ]
				%p1 = phi i2 [ %l1, %entry ]
				%p2 = phi i2 [ %l2, %entry ]
				%p3 = phi i2 [ %l3, %entry ]
				%r = zext i2 %p2 to i8
				ret i8 %r
				}

	declare void @f(i64, i64)			declare void @f(i64, i64)