Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -853,10 +853,32 @@ } } +// Returns true if the shuffle is extracting a contiguous range of values from +// LHS, for example: +// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP| +// Shuffles to: |EE|FF|GG|HH| +// +--+--+--+--+ +static bool ShuffleIsExtractingFromLHS(ShuffleVectorInst &SVI, + SmallVector &Mask) { + unsigned LHSElems = + cast(SVI.getOperand(0)->getType())->getNumElements(); + unsigned MaskElems = Mask.size(); + unsigned BegIdx = Mask.front(); + unsigned EndIdx = Mask.back(); + if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1) + return false; + for (unsigned I = 0; I != MaskElems; ++I) + if (static_cast(Mask[I]) != BegIdx + I) + return false; + return true; +} + Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); SmallVector Mask = SVI.getShuffleMask(); + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); bool MadeChange = false; @@ -892,18 +914,17 @@ SmallVector Elts; for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { if (Mask[i] < 0) { - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); continue; } if ((Mask[i] >= (int)e && isa(RHS)) || (Mask[i] < (int)e && isa(LHS))) { Mask[i] = -1; // Turn into undef. - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); } else { Mask[i] = Mask[i] % e; // Force to LHS. 
- Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()), - Mask[i])); + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); } } SVI.setOperand(0, SVI.getOperand(1)); @@ -929,6 +950,88 @@ return ReplaceInstUsesWith(SVI, V); } + // SROA generates shuffles followed by bitcast, which we can replace by a + // bitcast of the original vector followed by an extract: + // + // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, + // <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // %1 = bitcast <4 x i8> %sroa to i32 + // Becomes: + // %bc = bitcast <16 x i8> %in to <4 x i32> + // %ext = extractelement <4 x i32> %bc, i32 0 + // + // If the shuffle is extracting a contiguous range of values from the input + // vector then each use which is a bitcast of the extracted size can be + // replaced. This will work if the vector types are compatible, and the begin + // index is aligned to a value in the casted vector type. If the begin index + // isn't aligned then we can shuffle the original vector (keeping the same + // vector type) before extracting. + // + // This code will bail out if the target type is fundamentally incompatible + // with vectors of the source type. + // + // Example of <16 x i8>, target type i32: + // Index range [4,8): v-----------v Will work. + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // <16 x i8>: | | | | | | | | | | | | | | | | | + // <4 x i32>: | | | | | + // +-----------+-----------+-----------+-----------+ + // Index range [6,10): ^-----------^ Needs an extra shuffle. + // Target type i40: ^--------------^ Won't work, bail. 
+ if (ShuffleIsExtractingFromLHS(SVI, Mask)) { + unsigned MaskElems = Mask.size(); + unsigned BegIdx = Mask.front(); + VectorType *SrcTy = cast(LHS->getType()); + unsigned VecBitWidth = SrcTy->getBitWidth(); + unsigned SrcElemBitWidth = + SrcTy->getElementType()->getPrimitiveSizeInBits(); + assert(SrcElemBitWidth && "vector elements must have a bitwidth"); + unsigned SrcNumElems = SrcTy->getNumElements(); + SmallVector BCs; + for (User *U : SVI.users()) + if (BitCastInst *BC = dyn_cast(U)) + if (BC->hasNUsesOrMore(1)) + // Only visit bitcasts that weren't previously handled. + BCs.push_back(BC); + for (BitCastInst *BC : BCs) { + Type *TgtTy = BC->getDestTy(); + unsigned TgtElemBitWidth = TgtTy->getPrimitiveSizeInBits(); + if (!TgtElemBitWidth) + continue; + unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth; + bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth; + bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth); + if (!VecBitWidthsEqual) + continue; + if (!VectorType::isValidElementType(TgtTy)) + continue; + VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems); + if (!BegIsAligned) { + // Shuffle the input so [0,NumElements) contains the output, and + // [NumElems,SrcNumElems) is undef. 
+ Constant *Undef = ConstantInt::get(Int32Ty, SrcNumElems); + SmallVector ShuffleMask(SrcNumElems, Undef); + for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I) + ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx); + LHS = new ShuffleVectorInst(LHS, UndefValue::get(LHS->getType()), + ConstantVector::get(ShuffleMask), + SVI.getName() + ".extract", BC); + BegIdx = 0; + } + unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; + assert(SrcElemsPerTgtElem); + BegIdx /= SrcElemsPerTgtElem; + Instruction *Ext = ExtractElementInst::Create( + CastInst::Create(Instruction::BitCast, LHS, CastSrcTy, + SVI.getName() + ".bc", BC), + ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract", BC); + // The shufflevector isn't being replaced: the bitcast that used it + // is. InstCombine will visit the newly-created instructions. + ReplaceInstUsesWith(*BC, Ext); + MadeChange = true; + } + } + // If the LHS is a shufflevector itself, see if we can combine it with this // one without producing an unusual shuffle. // Cases that might be simplified: @@ -1099,7 +1202,6 @@ // or is a splat, do the replacement. 
if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { SmallVector Elts; - Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); for (unsigned i = 0, e = newMask.size(); i != e; ++i) { if (newMask[i] < 0) { Elts.push_back(UndefValue::get(Int32Ty)); Index: test/Transforms/InstCombine/type_pun.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/type_pun.ll @@ -0,0 +1,92 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +; Ensure that type punning using a union of vector and same-sized array +; generates an extract instead of a shuffle with an uncommon vector size: +; +; typedef uint32_t v4i32 __attribute__((vector_size(16))); +; union { v4i32 v; uint32_t a[4]; }; +; +; This cleans up behind SROA, which inserts the uncommon vector size when +; cleaning up the alloca/store/GEP/load. + + +; Extracting the zeroth element in an i32 array. +define i32 @type_pun_zeroth(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_zeroth( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: ret i32 %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + ret i32 %1 +} + +; Extracting the first element in an i32 array. +define i32 @type_pun_first(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_first( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 1 +; CHECK-NEXT: ret i32 %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + ret i32 %1 +} + +; Extracting an i32 that isn't aligned to any natural boundary. 
+define i32 @type_pun_misaligned(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_misaligned( +; CHECK-NEXT: %[[SHUF:.*]] = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %[[SHUF]] to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: ret i32 %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + ret i32 %1 +} + +; Type punning to an array of pointers. +define i32* @type_pun_pointer(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_pointer( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 +; CHECK-NEXT: %[[I2P:.*]] = inttoptr i32 %[[EXT]] to i32* +; CHECK-NEXT: ret i32* %[[I2P]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to i32 + %2 = inttoptr i32 %1 to i32* + ret i32* %2 +} + +; Type punning to an array of 32-bit floating-point values. +define float @type_pun_float(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_float( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x float> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x float> %[[BC]], i32 0 +; CHECK-NEXT: ret float %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> + %1 = bitcast <4 x i8> %sroa to float + ret float %1 +} + +; Type punning to an array of 64-bit floating-point values. +define double @type_pun_double(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_double( +; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <2 x double> +; CHECK-NEXT: %[[EXT:.*]] = extractelement <2 x double> %[[BC]], i32 0 +; CHECK-NEXT: ret double %[[EXT]] + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <8 x i32> + %1 = bitcast <8 x i8> %sroa to double + ret double %1 +} + +; Extracting a type that won't fit in a vector isn't handled. The function +; should stay the same. 
+define i40 @type_pun_unhandled(<16 x i8> %in) { +; CHECK-LABEL: @type_pun_unhandled( +; CHECK-NEXT: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> +; CHECK-NEXT: %1 = bitcast <5 x i8> %sroa to i40 +; CHECK-NEXT: ret i40 %1 + %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> + %1 = bitcast <5 x i8> %sroa to i40 + ret i40 %1 +}