Skip to content

Commit d52c990

Browse files
committedFeb 25, 2015
InstCombine: extract instead of shuffle when performing vector/array type punning
Summary: SROA generates code that isn't quite as easy to optimize and contains unusual-sized shuffles, but that code is generally correct. As discussed in D7487 the right place to clean things up is InstCombine, which will pick up the type-punning pattern and transform it into a more obvious bitcast+extractelement, while leaving the other patterns SROA encounters as-is. Test Plan: make check Reviewers: jvoung, chandlerc Subscribers: llvm-commits llvm-svn: 230560
1 parent de37434 commit d52c990

File tree

2 files changed

+253
-5
lines changed

2 files changed

+253
-5
lines changed
 

‎llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

+116-5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
//===----------------------------------------------------------------------===//
1414

1515
#include "InstCombineInternal.h"
16+
#include "llvm/ADT/DenseMap.h"
1617
#include "llvm/IR/PatternMatch.h"
1718
using namespace llvm;
1819
using namespace PatternMatch;
@@ -853,10 +854,32 @@ static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask,
853854
}
854855
}
855856

857+
// Returns true if the shuffle is extracting a contiguous range of values from
858+
// LHS, for example:
859+
// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
860+
// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
861+
// Shuffles to: |EE|FF|GG|HH|
862+
// +--+--+--+--+
863+
static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
864+
SmallVector<int, 16> &Mask) {
865+
unsigned LHSElems =
866+
cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements();
867+
unsigned MaskElems = Mask.size();
868+
unsigned BegIdx = Mask.front();
869+
unsigned EndIdx = Mask.back();
870+
if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
871+
return false;
872+
for (unsigned I = 0; I != MaskElems; ++I)
873+
if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
874+
return false;
875+
return true;
876+
}
877+
856878
Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
857879
Value *LHS = SVI.getOperand(0);
858880
Value *RHS = SVI.getOperand(1);
859881
SmallVector<int, 16> Mask = SVI.getShuffleMask();
882+
Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
860883

861884
bool MadeChange = false;
862885

@@ -892,18 +915,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
892915
SmallVector<Constant*, 16> Elts;
893916
for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) {
894917
if (Mask[i] < 0) {
895-
Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
918+
Elts.push_back(UndefValue::get(Int32Ty));
896919
continue;
897920
}
898921

899922
if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) ||
900923
(Mask[i] < (int)e && isa<UndefValue>(LHS))) {
901924
Mask[i] = -1; // Turn into undef.
902-
Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
925+
Elts.push_back(UndefValue::get(Int32Ty));
903926
} else {
904927
Mask[i] = Mask[i] % e; // Force to LHS.
905-
Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()),
906-
Mask[i]));
928+
Elts.push_back(ConstantInt::get(Int32Ty, Mask[i]));
907929
}
908930
}
909931
SVI.setOperand(0, SVI.getOperand(1));
@@ -929,6 +951,96 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
929951
return ReplaceInstUsesWith(SVI, V);
930952
}
931953

954+
// SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
955+
// a non-vector type. We can instead bitcast the original vector followed by
956+
// an extract of the desired element:
957+
//
958+
// %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
959+
// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
960+
// %1 = bitcast <4 x i8> %sroa to i32
961+
// Becomes:
962+
// %bc = bitcast <16 x i8> %in to <4 x i32>
963+
// %ext = extractelement <4 x i32> %bc, i32 0
964+
//
965+
// If the shuffle is extracting a contiguous range of values from the input
966+
// vector then each use which is a bitcast of the extracted size can be
967+
// replaced. This will work if the vector types are compatible, and the begin
968+
// index is aligned to a value in the casted vector type. If the begin index
969+
// isn't aligned then we can shuffle the original vector (keeping the same
970+
// vector type) before extracting.
971+
//
972+
// This code will bail out if the target type is fundamentally incompatible
973+
// with vectors of the source type.
974+
//
975+
// Example of <16 x i8>, target type i32:
976+
// Index range [4,8): v-----------v Will work.
977+
// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
978+
// <16 x i8>: | | | | | | | | | | | | | | | | |
979+
// <4 x i32>: | | | | |
980+
// +-----------+-----------+-----------+-----------+
981+
// Index range [6,10): ^-----------^ Needs an extra shuffle.
982+
// Target type i40: ^--------------^ Won't work, bail.
983+
if (isShuffleExtractingFromLHS(SVI, Mask)) {
984+
Value *V = LHS;
985+
unsigned MaskElems = Mask.size();
986+
unsigned BegIdx = Mask.front();
987+
VectorType *SrcTy = cast<VectorType>(V->getType());
988+
unsigned VecBitWidth = SrcTy->getBitWidth();
989+
unsigned SrcElemBitWidth =
990+
SrcTy->getElementType()->getPrimitiveSizeInBits();
991+
assert(SrcElemBitWidth && "vector elements must have a bitwidth");
992+
unsigned SrcNumElems = SrcTy->getNumElements();
993+
SmallVector<BitCastInst *, 8> BCs;
994+
DenseMap<Type *, Value *> NewBCs;
995+
for (User *U : SVI.users())
996+
if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
997+
if (!BC->use_empty())
998+
// Only visit bitcasts that weren't previously handled.
999+
BCs.push_back(BC);
1000+
for (BitCastInst *BC : BCs) {
1001+
Type *TgtTy = BC->getDestTy();
1002+
unsigned TgtElemBitWidth = TgtTy->getPrimitiveSizeInBits();
1003+
if (!TgtElemBitWidth)
1004+
continue;
1005+
unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
1006+
bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
1007+
bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
1008+
if (!VecBitWidthsEqual)
1009+
continue;
1010+
if (!VectorType::isValidElementType(TgtTy))
1011+
continue;
1012+
VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
1013+
if (!BegIsAligned) {
1014+
// Shuffle the input so [0,NumElements) contains the output, and
1015+
// [NumElems,SrcNumElems) is undef.
1016+
SmallVector<Constant *, 16> ShuffleMask(SrcNumElems,
1017+
UndefValue::get(Int32Ty));
1018+
for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
1019+
ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
1020+
V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()),
1021+
ConstantVector::get(ShuffleMask),
1022+
SVI.getName() + ".extract");
1023+
BegIdx = 0;
1024+
}
1025+
unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
1026+
assert(SrcElemsPerTgtElem);
1027+
BegIdx /= SrcElemsPerTgtElem;
1028+
bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
1029+
auto *NewBC =
1030+
BCAlreadyExists
1031+
? NewBCs[CastSrcTy]
1032+
: Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
1033+
if (!BCAlreadyExists)
1034+
NewBCs[CastSrcTy] = NewBC;
1035+
auto *Ext = Builder->CreateExtractElement(
1036+
NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
1037+
// The shufflevector isn't being replaced: the bitcast that used it
1038+
// is. InstCombine will visit the newly-created instructions.
1039+
ReplaceInstUsesWith(*BC, Ext);
1040+
MadeChange = true;
1041+
}
1042+
}
1043+
9321044
// If the LHS is a shufflevector itself, see if we can combine it with this
9331045
// one without producing an unusual shuffle.
9341046
// Cases that might be simplified:
@@ -1099,7 +1211,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
10991211
// or is a splat, do the replacement.
11001212
if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
11011213
SmallVector<Constant*, 16> Elts;
1102-
Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
11031214
for (unsigned i = 0, e = newMask.size(); i != e; ++i) {
11041215
if (newMask[i] < 0) {
11051216
Elts.push_back(UndefValue::get(Int32Ty));
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
; RUN: opt < %s -instcombine -S | FileCheck %s
2+
3+
; Ensure that type punning using a union of vector and same-sized array
4+
; generates an extract instead of a shuffle with an uncommon vector size:
5+
;
6+
; typedef uint32_t v4i32 __attribute__((vector_size(16)));
7+
; union { v4i32 v; uint32_t a[4]; };
8+
;
9+
; This cleans up behind SROA, which inserts the uncommon vector size when
10+
; cleaning up the alloca/store/GEP/load.
11+
12+
13+
; Extracting the zeroth element in an i32 array.
14+
define i32 @type_pun_zeroth(<16 x i8> %in) {
15+
; CHECK-LABEL: @type_pun_zeroth(
16+
; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
17+
; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
18+
; CHECK-NEXT: ret i32 %[[EXT]]
19+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
20+
%1 = bitcast <4 x i8> %sroa to i32
21+
ret i32 %1
22+
}
23+
24+
; Extracting the first element in an i32 array.
25+
define i32 @type_pun_first(<16 x i8> %in) {
26+
; CHECK-LABEL: @type_pun_first(
27+
; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
28+
; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 1
29+
; CHECK-NEXT: ret i32 %[[EXT]]
30+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
31+
%1 = bitcast <4 x i8> %sroa to i32
32+
ret i32 %1
33+
}
34+
35+
; Extracting an i32 that isn't aligned to any natural boundary.
36+
define i32 @type_pun_misaligned(<16 x i8> %in) {
37+
; CHECK-LABEL: @type_pun_misaligned(
38+
; CHECK-NEXT: %[[SHUF:.*]] = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
39+
; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %[[SHUF]] to <4 x i32>
40+
; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
41+
; CHECK-NEXT: ret i32 %[[EXT]]
42+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
43+
%1 = bitcast <4 x i8> %sroa to i32
44+
ret i32 %1
45+
}
46+
47+
; Type punning to an array of pointers.
48+
define i32* @type_pun_pointer(<16 x i8> %in) {
49+
; CHECK-LABEL: @type_pun_pointer(
50+
; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
51+
; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
52+
; CHECK-NEXT: %[[I2P:.*]] = inttoptr i32 %[[EXT]] to i32*
53+
; CHECK-NEXT: ret i32* %[[I2P]]
54+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
55+
%1 = bitcast <4 x i8> %sroa to i32
56+
%2 = inttoptr i32 %1 to i32*
57+
ret i32* %2
58+
}
59+
60+
; Type punning to an array of 32-bit floating-point values.
61+
define float @type_pun_float(<16 x i8> %in) {
62+
; CHECK-LABEL: @type_pun_float(
63+
; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x float>
64+
; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x float> %[[BC]], i32 0
65+
; CHECK-NEXT: ret float %[[EXT]]
66+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
67+
%1 = bitcast <4 x i8> %sroa to float
68+
ret float %1
69+
}
70+
71+
; Type punning to an array of 64-bit floating-point values.
72+
define double @type_pun_double(<16 x i8> %in) {
73+
; CHECK-LABEL: @type_pun_double(
74+
; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <2 x double>
75+
; CHECK-NEXT: %[[EXT:.*]] = extractelement <2 x double> %[[BC]], i32 0
76+
; CHECK-NEXT: ret double %[[EXT]]
77+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
78+
%1 = bitcast <8 x i8> %sroa to double
79+
ret double %1
80+
}
81+
82+
; Type punning to same-size floating-point and integer values.
83+
; Verify that multiple uses with different bitcast types are properly handled.
84+
define { float, i32 } @type_pun_float_i32(<16 x i8> %in) {
85+
; CHECK-LABEL: @type_pun_float_i32(
86+
; CHECK-NEXT: %[[BCI:.*]] = bitcast <16 x i8> %in to <4 x i32>
87+
; CHECK-NEXT: %[[EXTI:.*]] = extractelement <4 x i32> %[[BCI]], i32 0
88+
; CHECK-NEXT: %[[BCF:.*]] = bitcast <16 x i8> %in to <4 x float>
89+
; CHECK-NEXT: %[[EXTF:.*]] = extractelement <4 x float> %[[BCF]], i32 0
90+
; CHECK-NEXT: %1 = insertvalue { float, i32 } undef, float %[[EXTF]], 0
91+
; CHECK-NEXT: %2 = insertvalue { float, i32 } %1, i32 %[[EXTI]], 1
92+
; CHECK-NEXT: ret { float, i32 } %2
93+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
94+
%f = bitcast <4 x i8> %sroa to float
95+
%i = bitcast <4 x i8> %sroa to i32
96+
%1 = insertvalue { float, i32 } undef, float %f, 0
97+
%2 = insertvalue { float, i32 } %1, i32 %i, 1
98+
ret { float, i32 } %2
99+
}
100+
101+
; Type punning two i32 values, with control flow.
102+
; Verify that the bitcast is shared and dominates usage.
103+
define i32 @type_pun_i32_ctrl(<16 x i8> %in) {
104+
; CHECK-LABEL: @type_pun_i32_ctrl(
105+
entry: ; CHECK-NEXT: entry:
106+
; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
107+
; CHECK-NEXT: br
108+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
109+
br i1 undef, label %left, label %right
110+
left: ; CHECK: left:
111+
; CHECK-NEXT: %[[EXTL:.*]] = extractelement <4 x i32> %[[BC]], i32 0
112+
; CHECK-NEXT: br
113+
%lhs = bitcast <4 x i8> %sroa to i32
114+
br label %tail
115+
right: ; CHECK: right:
116+
; CHECK-NEXT: %[[EXTR:.*]] = extractelement <4 x i32> %[[BC]], i32 0
117+
; CHECK-NEXT: br
118+
%rhs = bitcast <4 x i8> %sroa to i32
119+
br label %tail
120+
tail: ; CHECK: tail:
121+
; CHECK-NEXT: %i = phi i32 [ %[[EXTL]], %left ], [ %[[EXTR]], %right ]
122+
; CHECK-NEXT: ret i32 %i
123+
%i = phi i32 [ %lhs, %left ], [ %rhs, %right ]
124+
ret i32 %i
125+
}
126+
127+
; Extracting a type that won't fit in a vector isn't handled. The function
128+
; should stay the same.
129+
define i40 @type_pun_unhandled(<16 x i8> %in) {
130+
; CHECK-LABEL: @type_pun_unhandled(
131+
; CHECK-NEXT: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
132+
; CHECK-NEXT: %1 = bitcast <5 x i8> %sroa to i40
133+
; CHECK-NEXT: ret i40 %1
134+
%sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
135+
%1 = bitcast <5 x i8> %sroa to i40
136+
ret i40 %1
137+
}

0 commit comments

Comments
 (0)
Please sign in to comment.