This is an archive of the discontinued LLVM Phabricator instance.

SROA: extract instead of shuffle when performing vector/array type punning
AbandonedPublic

Authored by jfb on Feb 7 2015, 5:16 PM.

Download Raw Diff

Details

Reviewers

chandlerc
jvoung

Summary

The resulting code is shorter and simpler to optimize. The existing code was more general, and still serves as the fallback case when the incoming vector type and the outgoing scalar are incompatible. This code should trigger in more situations than just type punning, but type punning is the user code I saw it trigger on.

Diff Detail

Event Timeline

jfb updated this revision to Diff 19540.Feb 7 2015, 5:16 PM

jfb retitled this revision from to SROA: extract instead of shuffle when performing vector/array type punning.

jfb updated this object.

jfb edited the test plan for this revision. (Show Details)

jfb added reviewers: chandlerc, jvoung.

jfb added a subscriber: Unknown Object (MLST).

majnemer added a subscriber: majnemer.Feb 7 2015, 6:15 PM

majnemer added inline comments.

lib/Transforms/Scalar/SROA.cpp
3075–3078	TODO(name) is not typical LLVM style.

TODO -> FIXME

jfb added inline comments.Feb 7 2015, 7:05 PM

lib/Transforms/Scalar/SROA.cpp
3075–3078	Done.

One small inline comment.

lib/Transforms/Scalar/SROA.cpp
2616–2621	Block comment here.

Add block comments to functions extractVector and rewriteVectorizedLoadInst.

On the mailing list @chandlerc said:

Not sure this is the right approach. It is a lot of complexity, and we still have the fallback.

Have you looked at teaching instcombine to transform the code produced by SROA today into the element extract? That would seem a better layering at the least, although I'm still on the fence about whether we want to in general perform this operation.

This seems acceptable to me, I just want to make sure I record this on Phabricator. Here are a few thoughts:

instcombine will "clean up" what SROA currently does by pattern matching what SROA currently produced.
instcombine may also end up cleaning other code that looks the same (good!).
The code in instcombine may diverge from SROA, so I'll write a test that makes sure SROA followed by instcombine is "clean".
I could instead keep the code in SROA and remove the fallback and (from my beautiful ASCII art example) return the i40 directly instead of having the fallback. I'm just not sure this code gets hit much, so I figured addressing the case I saw first made sense, but can address both if you think it worthwhile.

I'll get started on instcombine and send a separate patch.

jfb mentioned this in D7734: InstCombine: extract instead of shuffle when performing vector/array type punning.Feb 18 2015, 12:55 PM

I finally got back to this and implemented the solution we discussed in InstCombine: D7734. Abandoning this revision in favor of the new one.

jfb mentioned this in rL230560: InstCombine: extract instead of shuffle when performing vector/array type….Feb 25 2015, 2:33 PM

Revision Contents

Path

Size

lib/

Transforms/

Scalar/

SROA.cpp

86 lines

test/

Transforms/

SROA/

vector-promotion.ll

25 lines

Diff 19544

lib/Transforms/Scalar/SROA.cpp

Show First 20 Lines • Show All 2,271 Lines • ▼ Show 20 Lines	if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
Old = IRB.CreateAnd(Old, Mask, Name + ".mask");		Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
DEBUG(dbgs() << " masked: " << *Old << "\n");		DEBUG(dbgs() << " masked: " << *Old << "\n");
V = IRB.CreateOr(Old, V, Name + ".insert");		V = IRB.CreateOr(Old, V, Name + ".insert");
DEBUG(dbgs() << " inserted: " << *V << "\n");		DEBUG(dbgs() << " inserted: " << *V << "\n");
}		}
return V;		return V;
}		}

static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,		// \brief Extract a continuous range of elements from a vector.
unsigned EndIndex, const Twine &Name) {		//
		// \param V Vector value to extract from.
		// \param TargetTy Type to which the return value will be converted. Used to
		// optimize the vector extraction when possible.
		static Value *extractVector(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
		unsigned BeginIndex, unsigned EndIndex,
		Type *TargetTy, const Twine &Name) {
VectorType *VecTy = cast<VectorType>(V->getType());		VectorType *VecTy = cast<VectorType>(V->getType());
unsigned NumElements = EndIndex - BeginIndex;		unsigned NumElements = EndIndex - BeginIndex;
assert(NumElements <= VecTy->getNumElements() && "Too many elements!");		assert(NumElements <= VecTy->getNumElements() && "Too many elements!");

if (NumElements == VecTy->getNumElements())		if (NumElements == VecTy->getNumElements())
return V;		return V;

if (NumElements == 1) {		if (NumElements == 1) {
V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),		V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
Name + ".extract");		Name + ".extract");
DEBUG(dbgs() << " extract: " << *V << "\n");		DEBUG(dbgs() << " extract: " << *V << "\n");
return V;		return V;
}		}

SmallVector<Constant *, 8> Mask;		SmallVector<Constant *, 16> Mask;

		// Try to cast the vector to another vector type of the same bitwidth, and
		// extract an element. This will work if the vector types are compatible, and
		// the begin index is aligned to a value in the casted vector type. If the
		// begin index isn't aligned then we can shuffle the original vector (keeping
		// the same vector type) before extracting.
		//
		// This code will bail out if the target type is fundamentally incompatible
		// with vectors of the source type.
		//
		// Example of <16 x i8>, target type i32:
		// Index range [4,8):     v-----------v Will work.
		//            +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
		// <16 x i8>: |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
		// <4 x i32>: |           |           |           |           |
		//            +-----------+-----------+-----------+-----------+
		// Index range [6,10):          ^-----------^ Needs an extra shuffle.
		// Target type i40:       ^--------------^ Won't work, bail.
		if (unsigned TargetElemBitWidth = TargetTy->getPrimitiveSizeInBits()) {
		unsigned VecBitWidth = VecTy->getBitWidth();
		unsigned SrcElemBitWidth =
		VecTy->getElementType()->getPrimitiveSizeInBits();
		assert(SrcElemBitWidth && "vector elements must have a bitwidth");
		unsigned SrcNumElems = VecTy->getNumElements();
		unsigned TargetNumElems = VecBitWidth / TargetElemBitWidth;
		bool VecBitWidthsEqual = VecBitWidth == TargetNumElems * TargetElemBitWidth;
		bool BeginIsAligned =
		0 == ((SrcElemBitWidth * BeginIndex) % TargetElemBitWidth);
		if (VecBitWidthsEqual && VectorType::isValidElementType(TargetTy)) {
		VectorType *CastVecTy = VectorType::get(TargetTy, TargetNumElems);
		if (canConvertValue(DL, VecTy, CastVecTy)) {
		if (!BeginIsAligned) {
		// Shuffle the input so [0,NumElements) contains the output, and
		// [NumElems,SrcNumElems) is undef.
		Mask.reserve(SrcNumElems);
		unsigned i = BeginIndex;
		while (i != EndIndex)
		Mask.push_back(IRB.getInt32(i++));
		while (i++ != SrcNumElems)
		Mask.push_back(IRB.getInt32(SrcNumElems)); // undef
		V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
		ConstantVector::get(Mask),
		Name + ".extract");
		DEBUG(dbgs() << " shuffle: " << *V << "\n");
		BeginIndex = 0;
		}
		unsigned SrcElemsPerTargetElem = TargetElemBitWidth / SrcElemBitWidth;
		assert(SrcElemsPerTargetElem);
		BeginIndex /= SrcElemsPerTargetElem;
		V = IRB.CreateExtractElement(convertValue(DL, IRB, V, CastVecTy),
		IRB.getInt32(BeginIndex),
		Name + ".extract");
		DEBUG(dbgs() << " extract: " << *V << "\n");
		return V;
		}
		}
		}

Mask.reserve(NumElements);		Mask.reserve(NumElements);
for (unsigned i = BeginIndex; i != EndIndex; ++i)		for (unsigned i = BeginIndex; i != EndIndex; ++i)
Mask.push_back(IRB.getInt32(i));		Mask.push_back(IRB.getInt32(i));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),		V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".extract");		ConstantVector::get(Mask), Name + ".extract");
DEBUG(dbgs() << " shuffle: " << *V << "\n");		DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;		return V;
}		}
▲ Show 20 Lines • Show All 239 Lines • ▼ Show 20 Lines	#endif
}		}

void deleteIfTriviallyDead(Value *V) {		void deleteIfTriviallyDead(Value *V) {
Instruction *I = cast<Instruction>(V);		Instruction *I = cast<Instruction>(V);
if (isInstructionTriviallyDead(I))		if (isInstructionTriviallyDead(I))
Pass.DeadInsts.insert(I);		Pass.DeadInsts.insert(I);
}		}

Value *rewriteVectorizedLoadInst() {		// \brief Rewrite a vector load instruction to a load followed by the
		// extraction of a subset of the vector's elements.
		//
		// \param TargetTy Type to which the return value will be converted. Used to
		// optimize the vector extraction when possible.
		Value *rewriteVectorizedLoadInst(Type *TargetTy) {
		echristoUnsubmitted Not Done Reply Inline Actions Block comment here. echristo: Block comment here.
unsigned BeginIndex = getIndex(NewBeginOffset);		unsigned BeginIndex = getIndex(NewBeginOffset);
unsigned EndIndex = getIndex(NewEndOffset);		unsigned EndIndex = getIndex(NewEndOffset);
assert(EndIndex > BeginIndex && "Empty vector!");		assert(EndIndex > BeginIndex && "Empty vector!");

Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");		Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
return extractVector(IRB, V, BeginIndex, EndIndex, "vec");		return extractVector(DL, IRB, V, BeginIndex, EndIndex, TargetTy, "vec");
}		}

Value *rewriteIntegerLoad(LoadInst &LI) {		Value *rewriteIntegerLoad(LoadInst &LI) {
assert(IntTy && "We cannot insert an integer to the alloca");		assert(IntTy && "We cannot insert an integer to the alloca");
assert(!LI.isVolatile());		assert(!LI.isVolatile());
Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");		Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
V = convertValue(DL, IRB, V, IntTy);		V = convertValue(DL, IRB, V, IntTy);
assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");		assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
Show All 9 Lines	bool visitLoadInst(LoadInst &LI) {
Value *OldOp = LI.getOperand(0);		Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);		assert(OldOp == OldPtr);

Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)		Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();		: LI.getType();
bool IsPtrAdjusted = false;		bool IsPtrAdjusted = false;
Value *V;		Value *V;
if (VecTy) {		if (VecTy) {
V = rewriteVectorizedLoadInst();		V = rewriteVectorizedLoadInst(TargetTy);
} else if (IntTy && LI.getType()->isIntegerTy()) {		} else if (IntTy && LI.getType()->isIntegerTy()) {
V = rewriteIntegerLoad(LI);		V = rewriteIntegerLoad(LI);
} else if (NewBeginOffset == NewAllocaBeginOffset &&		} else if (NewBeginOffset == NewAllocaBeginOffset &&
canConvertValue(DL, NewAllocaTy, LI.getType())) {		canConvertValue(DL, NewAllocaTy, LI.getType())) {
V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),		V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),
LI.getName());		LI.getName());
} else {		} else {
Type *LTy = TargetTy->getPointerTo();		Type *LTy = TargetTy->getPointerTo();
▲ Show 20 Lines • Show All 405 Lines • ▼ Show 20 Lines	bool visitMemTransferInst(MemTransferInst &II) {
if (!IsDest) {		if (!IsDest) {
std::swap(SrcPtr, DstPtr);		std::swap(SrcPtr, DstPtr);
std::swap(SrcAlign, DstAlign);		std::swap(SrcAlign, DstAlign);
}		}

Value *Src;		Value *Src;
if (VecTy && !IsWholeAlloca && !IsDest) {		if (VecTy && !IsWholeAlloca && !IsDest) {
Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");		Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");		// FIXME: in some cases we can figure out a better target type which would
		// allow generating an extract directly.
		Type *TargetTy = OtherPtrTy->getPointerElementType();
		Src = extractVector(DL, IRB, Src, BeginIndex, EndIndex, TargetTy, "vec");
		majnemerUnsubmitted Not Done Reply Inline Actions TODO(name) is not typical LLVM style. majnemer: TODO(name) is not typical LLVM style.
		jfbAuthorUnsubmitted Not Done Reply Inline Actions Done. jfb: Done.
} else if (IntTy && !IsWholeAlloca && !IsDest) {		} else if (IntTy && !IsWholeAlloca && !IsDest) {
Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");		Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
Src = convertValue(DL, IRB, Src, IntTy);		Src = convertValue(DL, IRB, Src, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;		uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");		Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {		} else {
Src =		Src =
IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");		IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
▲ Show 20 Lines • Show All 1,458 Lines • Show Last 20 Lines

test/Transforms/SROA/vector-promotion.ll

Show First 20 Lines • Show All 617 Lines • ▼ Show 20 Lines	; CHECK-NOT: store
%cast3 = bitcast <3 x float>* %cast2 to <4 x float>*		%cast3 = bitcast <3 x float>* %cast2 to <4 x float>*
%vec = load <4 x float>* %cast3		%vec = load <4 x float>* %cast3
; CHECK-NOT: load		; CHECK-NOT: load

; CHECK: %[[ret:.*]] = bitcast <4 x i32> undef to <4 x float>		; CHECK: %[[ret:.*]] = bitcast <4 x i32> undef to <4 x float>
; CHECK-NEXT: ret <4 x float> %[[ret]]		; CHECK-NEXT: ret <4 x float> %[[ret]]
ret <4 x float> %vec		ret <4 x float> %vec
}		}

		%U4xi32 = type { <4 x i32> }

		define i32 @type_pun(<16 x i8> %in) {
		; Ensure that type punning using a union of vector and same-sized array
		; generates an extract.
		;
		; CHECK-LABEL: @type_pun(
		; CHECK-NOT: alloca
		; CHECK-NEXT: %[[BC1:.*]] = bitcast <16 x i8> %in to <4 x i32>
		; CHECK-NEXT: %[[EXT1:.*]] = extractelement <4 x i32> %[[BC1]], i32 0
		; CHECK-NEXT: %[[BC2:.*]] = bitcast <16 x i8> %in to <4 x i32>
		; CHECK-NEXT: %[[EXT2:.*]] = extractelement <4 x i32> %[[BC2]], i32 2
		; CHECK-NEXT: %[[SUM:.*]] = add i32 %[[EXT1]], %[[EXT2]]
		; CHECK-NEXT: ret i32 %[[SUM]]
		%stack = alloca %U4xi32, align 16
		%vec = bitcast %U4xi32* %stack to <16 x i8>*
		store <16 x i8> %in, <16 x i8>* %vec, align 16
		%idx1 = getelementptr inbounds %U4xi32* %stack, i32 0, i32 0, i32 0
		%elem1 = load i32* %idx1, align 4
		%idx2 = getelementptr inbounds %U4xi32* %stack, i32 0, i32 0, i32 2
		%elem2 = load i32* %idx2, align 4
		%sum = add i32 %elem1, %elem2
		ret i32 %sum
		}