Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; @@ -853,10 +854,32 @@ } } +// Returns true if the shuffle is extracting a contiguous range of values from +// LHS (a mask with any undef, i.e. -1, entry never qualifies), for example: +// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP| +// Shuffles to: |EE|FF|GG|HH| +// +--+--+--+--+ +static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI, + SmallVector &Mask) { + unsigned LHSElems = + cast(SVI.getOperand(0)->getType())->getNumElements(); + unsigned MaskElems = Mask.size(); + unsigned BegIdx = Mask.front(); + unsigned EndIdx = Mask.back(); + if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1) + return false; + for (unsigned I = 0; I != MaskElems; ++I) + if (static_cast(Mask[I]) != BegIdx + I) + return false; + return true; +} + Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); SmallVector Mask = SVI.getShuffleMask(); + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); bool MadeChange = false; @@ -892,18 +915,17 @@ SmallVector Elts; for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { if (Mask[i] < 0) { - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); continue; } if ((Mask[i] >= (int)e && isa(RHS)) || (Mask[i] < (int)e && isa(LHS))) { Mask[i] = -1; // Turn into undef. 
- Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); } else { Mask[i] = Mask[i] % e; // Force to LHS. - Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()), - Mask[i])); + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); } } SVI.setOperand(0, SVI.getOperand(1)); @@ -929,6 +951,96 @@ return ReplaceInstUsesWith(SVI, V); } + // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to + // a non-vector type. We can instead bitcast the original vector followed + // by an extract of the desired element: + // + // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, + // <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // %1 = bitcast <4 x i8> %sroa to i32 + // Becomes: + // %bc = bitcast <16 x i8> %in to <4 x i32> + // %ext = extractelement <4 x i32> %bc, i32 0 + // + // If the shuffle is extracting a contiguous range of values from the input + // vector then each use which is a bitcast of the extracted size can be + // replaced. This will work if the vector types are compatible, and the begin + // index is aligned to a value in the casted vector type. If the begin index + // isn't aligned then we can shuffle the original vector (keeping the same + // vector type) before extracting. + // + // This code will bail out if the target type is fundamentally incompatible + // with vectors of the source type. + // + // Example of <16 x i8>, target type i32: + // Index range [4,8): v-----------v Will work. + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // <16 x i8>: | | | | | | | | | | | | | | | | | + // <4 x i32>: | | | | | + // +-----------+-----------+-----------+-----------+ + // Index range [6,10): ^-----------^ Needs an extra shuffle. + // Target type i40: ^--------------^ Won't work, bail. 
+ if (isShuffleExtractingFromLHS(SVI, Mask)) { + Value *V = LHS; + unsigned MaskElems = Mask.size(); + // Start of the extracted range within V, in source elements; see below. + unsigned SrcBeg = Mask.front(); + VectorType *SrcTy = cast(V->getType()); + unsigned VecBitWidth = SrcTy->getBitWidth(); + unsigned SrcElemBitWidth = + SrcTy->getElementType()->getPrimitiveSizeInBits(); + assert(SrcElemBitWidth && "vector elements must have a bitwidth"); + unsigned SrcNumElems = SrcTy->getNumElements(); + SmallVector BCs; + DenseMap NewBCs; + for (User *U : SVI.users()) + if (BitCastInst *BC = dyn_cast(U)) + if (!BC->use_empty()) + // Only visit bitcasts that weren't previously handled. + BCs.push_back(BC); + for (BitCastInst *BC : BCs) { + Type *TgtTy = BC->getDestTy(); + unsigned TgtElemBitWidth = TgtTy->getPrimitiveSizeInBits(); + if (!TgtElemBitWidth) + continue; + unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth; + bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth; + bool BegIsAligned = 0 == ((SrcElemBitWidth * SrcBeg) % TgtElemBitWidth); + if (!VecBitWidthsEqual || !VectorType::isValidElementType(TgtTy)) + continue; + VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems); + if (!BegIsAligned) { + // Shuffle the input so [0,NumElements) contains the output, and + // [NumElems,SrcNumElems) is undef. + Constant *Undef = UndefValue::get(Int32Ty); + SmallVector ShuffleMask(SrcNumElems, Undef); + for (unsigned I = 0, E = MaskElems, Idx = SrcBeg; I != E; ++Idx, ++I) + ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx); + V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(ShuffleMask), + SVI.getName() + ".extract"); + SrcBeg = 0; + } + unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; + assert(SrcElemsPerTgtElem); + // Per-bitcast index in target elements; SrcBeg must survive unscaled. + unsigned BegIdx = SrcBeg / SrcElemsPerTgtElem; + bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end(); + auto *NewBC = + BCAlreadyExists + ? 
NewBCs[CastSrcTy] + : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); + if (!BCAlreadyExists) + NewBCs[CastSrcTy] = NewBC; + auto *Ext = Builder->CreateExtractElement( + NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract"); + // The shufflevector isn't being replaced: the bitcast that used it + // is. InstCombine will visit the newly-created instructions. + ReplaceInstUsesWith(*BC, Ext); + MadeChange = true; + } + } + // If the LHS is a shufflevector itself, see if we can combine it with this // one without producing an unusual shuffle. // Cases that might be simplified: @@ -1099,7 +1211,6 @@ // or is a splat, do the replacement. if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { SmallVector Elts; - Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); for (unsigned i = 0, e = newMask.size(); i != e; ++i) { if (newMask[i] < 0) { Elts.push_back(UndefValue::get(Int32Ty)); Index: test/Transforms/InstCombine/type_pun.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/type_pun.ll @@ -0,0 +1,137 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +; Ensure that type punning using a union of vector and same-sized array +; generates an extract instead of a shuffle with an uncommon vector size: +; +; typedef uint32_t v4i32 __attribute__((vector_size(16))); +; union { v4i32 v; uint32_t a[4]; }; +; +; This cleans up behind SROA, which inserts the uncommon vector size when +; cleaning up the alloca/store/GEP/load. + + +; Extracting the zeroth element in an i32 array. 
+define i32 @type_pun_zeroth(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_zeroth( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: ret i32 %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + ret i32 %1 +} + +; Extracting the first element in an i32 array. +define i32 @type_pun_first(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_first( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 1 +; CHECK-NEXT: ret i32 %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + ret i32 %1 +} + +; Extracting an i32 that isn't aligned to any natural boundary. +define i32 @type_pun_misaligned(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_misaligned( +; CHECK-NEXT: %[[SHUF:.*]] = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %[[SHUF]] to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: ret i32 %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + ret i32 %1 +} + +; Type punning to an array of pointers. +define i32* @type_pun_pointer(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_pointer( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: %[[I2P:.*]] = inttoptr i32 %[[EXT]] to i32* +; CHECK-NEXT: ret i32* %[[I2P]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + %2 = inttoptr i32 %1 to i32* + ret i32* %2 +} + +; Type punning to an array of 32-bit floating-point values. 
+define float @type_pun_float(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_float( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x float> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x float> %[[BC]], i32 0 +; CHECK-NEXT: ret float %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to float + ret float %1 +} + +; Type punning to an array of 64-bit floating-point values. +define double @type_pun_double(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_double( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <2 x double> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <2 x double> %[[BC]], i32 0 +; CHECK-NEXT: ret double %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <8 x i32> + %1 = bitcast <8 x i8> %sroa to double + ret double %1 +} + +; Type punning to same-size floating-point and integer values. +; Verify that multiple uses with different bitcast types are properly handled. +define { float, i32 } @type_pun_float_i32(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_float_i32( +; CHECK-NEXT: %[[BCI:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: %[[EXTI:.*]] = extractelement <4 x i32> %[[BCI]], i32 0 +; CHECK-NEXT: %[[BCF:.*]] = bitcast <16 x i8> %in to <4 x float> +; CHECK-NEXT: %[[EXTF:.*]] = extractelement <4 x float> %[[BCF]], i32 0 +; CHECK-NEXT: %1 = insertvalue { float, i32 } undef, float %[[EXTF]], 0 +; CHECK-NEXT: %2 = insertvalue { float, i32 } %1, i32 %[[EXTI]], 1 +; CHECK-NEXT: ret { float, i32 } %2 + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %f = bitcast <4 x i8> %sroa to float + %i = bitcast <4 x i8> %sroa to i32 + %1 = insertvalue { float, i32 } undef, float %f, 0 + %2 = insertvalue { float, i32 } %1, i32 %i, 1 + ret { float, i32 } %2 +} + +; Type punning two i32 values, with control flow. +; Verify that the bitcast is shared and dominates usage. 
+define i32 @type_pun_i32_ctrl(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_i32_ctrl( +entry: ; CHECK-NEXT: entry: +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: br + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + br i1 undef, label %left, label %right +left: ; CHECK: left: +; CHECK-NEXT: %[[EXTL:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: br + %lhs = bitcast <4 x i8> %sroa to i32 + br label %tail +right: ; CHECK: right: +; CHECK-NEXT: %[[EXTR:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: br + %rhs = bitcast <4 x i8> %sroa to i32 + br label %tail +tail: ; CHECK: tail: +; CHECK-NEXT: %i = phi i32 [ %[[EXTL]], %left ], [ %[[EXTR]], %right ] +; CHECK-NEXT: ret i32 %i + %i = phi i32 [ %lhs, %left ], [ %rhs, %right ] + ret i32 %i +} + +; Extracting a type that won't fit in a vector isn't handled. The function +; should stay the same. +define i40 @type_pun_unhandled(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_unhandled( +; CHECK-NEXT: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> +; CHECK-NEXT: %1 = bitcast <5 x i8> %sroa to i40 +; CHECK-NEXT: ret i40 %1 + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> + %1 = bitcast <5 x i8> %sroa to i40 + ret i40 %1 +}