This is an archive of the discontinued LLVM Phabricator instance.

InstCombine: fix extraction when performing vector/array punning
ClosedPublic

Authored by evgeny777 on Feb 3 2017, 6:46 AM.

Download Raw Diff

Details

Reviewers

spatel
chandlerc
ABataev
mkuper
jfb
jvoung

Commits

rG958fcd750269: InstCombine: fix extraction when performing vector/array punning
rL295429: InstCombine: fix extraction when performing vector/array punning

Summary

The current implementation of visitShuffleVectorInst emits incorrect IR when a shufflevector instruction is followed by multiple bitcast instructions. Consider the following example:

define void @test(<16 x i8> %w, i32* %o1, float* %o2) {
  %v = shufflevector <16 x i8> %w, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %f = bitcast <4 x i8> %v to float
  %i = bitcast <4 x i8> %v to i32
  store i32 %i, i32* %o1, align 4
  store float %f, float* %o2, align 4
  ret void
}

If you try to run opt -instcombine over it, you'll get the following IR:

define void @test(<16 x i8> %w, i32* %o1, float* %o2) {
  %v.bc = bitcast <16 x i8> %w to <4 x i32>
  %v.extract = extractelement <4 x i32> %v.bc, i32 3
  %v.extract1 = shufflevector <16 x i8> %w, <16 x i8> undef, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %v.bc2 = bitcast <16 x i8> %v.extract1 to <4 x float>
  %v.extract3 = extractelement <4 x float> %v.bc2, i32 0
  store i32 %v.extract, i32* %o1, align 4
  store float %v.extract3, float* %o2, align 4
  ret void
}

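The spurious shuffle %v.extract1 is emitted because BegIdx is declared outside the for-loop but modified inside it: after the first bitcast has been handled, BegIdx is 3 instead of 12 (as it should be), and that stale value is then used when the second bitcast is processed.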

If you apply this patch, you'll get the following:

define void @test(<16 x i8> %w, i32* %o1, float* %o2) {
  %v.bc = bitcast <16 x i8> %w to <4 x i32>
  %v.extract = extractelement <4 x i32> %v.bc, i32 3
  %v.bc1 = bitcast <16 x i8> %w to <4 x float>
  %v.extract2 = extractelement <4 x float> %v.bc1, i32 3
  store i32 %v.extract, i32* %o1, align 4
  store float %v.extract2, float* %o2, align 4
  ret void
}

The broken optimization causes incorrect calculation of a DCT matrix when compiling with clang -O3 with ARM NEON extensions enabled.

Diff Detail

Repository: rL LLVM

Event Timeline

evgeny777 created this revision. · Feb 3 2017, 6:46 AM

Herald added a subscriber: aemerson. · View Herald Transcript · Feb 3 2017, 6:46 AM

evgeny777 added a subscriber: llvm-commits. · Feb 3 2017, 11:03 PM

Any comments on it?

evgeny777 added reviewers: spatel, mkuper, ABataev. · Feb 14 2017, 11:58 PM

LGTM, thanks!

This revision is now accepted and ready to land. · Feb 15 2017, 11:00 AM

Closed by commit rL295429: InstCombine: fix extraction when performing vector/array punning (authored by evgeny777). · Explain Why · Feb 16 2017, 11:47 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

InstCombine/

InstCombineVectorOps.cpp

2 lines

test/

Transforms/

InstCombine/

shufflevec-bitcast.ll

16 lines

Diff 88862

llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

Show First 20 Lines • Show All 1,203 Lines • ▼ Show 20 Lines	Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// <16 x i8>: \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \|		// <16 x i8>: \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \|
// <4 x i32>: \| \| \| \| \|		// <4 x i32>: \| \| \| \| \|
// +-----------+-----------+-----------+-----------+		// +-----------+-----------+-----------+-----------+
// Index range [6,10): ^-----------^ Needs an extra shuffle.		// Index range [6,10): ^-----------^ Needs an extra shuffle.
// Target type i40: ^--------------^ Won't work, bail.		// Target type i40: ^--------------^ Won't work, bail.
if (isShuffleExtractingFromLHS(SVI, Mask)) {		if (isShuffleExtractingFromLHS(SVI, Mask)) {
Value *V = LHS;		Value *V = LHS;
unsigned MaskElems = Mask.size();		unsigned MaskElems = Mask.size();
unsigned BegIdx = Mask.front();
VectorType *SrcTy = cast<VectorType>(V->getType());		VectorType *SrcTy = cast<VectorType>(V->getType());
unsigned VecBitWidth = SrcTy->getBitWidth();		unsigned VecBitWidth = SrcTy->getBitWidth();
unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());		unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
assert(SrcElemBitWidth && "vector elements must have a bitwidth");		assert(SrcElemBitWidth && "vector elements must have a bitwidth");
unsigned SrcNumElems = SrcTy->getNumElements();		unsigned SrcNumElems = SrcTy->getNumElements();
SmallVector<BitCastInst *, 8> BCs;		SmallVector<BitCastInst *, 8> BCs;
DenseMap<Type *, Value *> NewBCs;		DenseMap<Type *, Value *> NewBCs;
for (User *U : SVI.users())		for (User *U : SVI.users())
if (BitCastInst *BC = dyn_cast<BitCastInst>(U))		if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
if (!BC->use_empty())		if (!BC->use_empty())
// Only visit bitcasts that weren't previously handled.		// Only visit bitcasts that weren't previously handled.
BCs.push_back(BC);		BCs.push_back(BC);
for (BitCastInst *BC : BCs) {		for (BitCastInst *BC : BCs) {
		unsigned BegIdx = Mask.front();
Type *TgtTy = BC->getDestTy();		Type *TgtTy = BC->getDestTy();
unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);		unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
if (!TgtElemBitWidth)		if (!TgtElemBitWidth)
continue;		continue;
unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;		unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;		bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);		bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
if (!VecBitWidthsEqual)		if (!VecBitWidthsEqual)
▲ Show 20 Lines • Show All 226 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/InstCombine/shufflevec-bitcast.ll

				; RUN: opt < %s -instcombine -S | FileCheck %s

				define void @test(<16 x i8> %w, i32* %o1, float* %o2) {

				; CHECK: %v.bc = bitcast <16 x i8> %w to <4 x i32>
				; CHECK-NEXT: %v.extract = extractelement <4 x i32> %v.bc, i32 3
				; CHECK-NEXT: %v.bc{{[0-9]*}} = bitcast <16 x i8> %w to <4 x float>
				; CHECK-NEXT: %v.extract{{[0-9]}} = extractelement <4 x float> %v.bc{{[0-9]}}, i32 3

				%v = shufflevector <16 x i8> %w, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
				%f = bitcast <4 x i8> %v to float
				%i = bitcast <4 x i8> %v to i32
				store i32 %i, i32* %o1, align 4
				store float %f, float* %o2, align 4
				ret void
				}