Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombineInternal.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/PatternMatch.h"
 using namespace llvm;
 using namespace PatternMatch;
@@ -853,10 +854,32 @@
   }
 }
 
+// Returns true if the shuffle is extracting a contiguous range of values from
+// LHS, for example:
+//                 +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+//   Input:        |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
+//   Shuffles to:  |EE|FF|GG|HH|
+//                 +--+--+--+--+
+static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
+                                       SmallVector<int, 16> &Mask) {
+  unsigned LHSElems =
+      cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements();
+  unsigned MaskElems = Mask.size();
+  unsigned BegIdx = Mask.front();
+  unsigned EndIdx = Mask.back();
+  if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
+    return false;
+  for (unsigned I = 0; I != MaskElems; ++I)
+    if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
+      return false;
+  return true;
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
+  Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
 
   bool MadeChange = false;
 
@@ -892,18 +915,17 @@
     SmallVector<Constant*, 16> Elts;
     for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) {
       if (Mask[i] < 0) {
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
         continue;
       }
 
       if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) ||
           (Mask[i] <  (int)e && isa<UndefValue>(LHS))) {
         Mask[i] = -1;     // Turn into undef.
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
       } else {
         Mask[i] = Mask[i] % e;  // Force to LHS.
-        Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()),
-                                        Mask[i]));
+        Elts.push_back(ConstantInt::get(Int32Ty, Mask[i]));
       }
     }
     SVI.setOperand(0, SVI.getOperand(1));
@@ -929,6 +951,96 @@
     return ReplaceInstUsesWith(SVI, V);
   }
 
+  // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
+  // a non-vector type. We can instead bitcast the original vector followed by
+  // an extract of the desired element:
+  //
+  //   %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
+  //                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  //   %1 = bitcast <4 x i8> %sroa to i32
+  // Becomes:
+  //   %bc = bitcast <16 x i8> %in to <4 x i32>
+  //   %ext = extractelement <4 x i32> %bc, i32 0
+  //
+  // If the shuffle is extracting a contiguous range of values from the input
+  // vector then each use which is a bitcast of the extracted size can be
+  // replaced. This will work if the vector types are compatible, and the begin
+  // index is aligned to a value in the casted vector type. If the begin index
+  // isn't aligned then we can shuffle the original vector (keeping the same
+  // vector type) before extracting.
+  //
+  // This code will bail out if the target type is fundamentally incompatible
+  // with vectors of the source type.
+  //
+  // Example of <16 x i8>, target type i32:
+  // Index range [4,8):         v-----------v Will work.
+  //                +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+  //     <16 x i8>: |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+  //     <4 x i32>: |           |           |           |           |
+  //                +-----------+-----------+-----------+-----------+
+  // Index range [6,10):              ^-----------^ Needs an extra shuffle.
+  // Target type i40:           ^--------------^ Won't work, bail.
+  if (isShuffleExtractingFromLHS(SVI, Mask)) {
+    Value *V = LHS;
+    unsigned MaskElems = Mask.size();
+    unsigned BegIdx = Mask.front();
+    VectorType *SrcTy = cast<VectorType>(V->getType());
+    unsigned VecBitWidth = SrcTy->getBitWidth();
+    unsigned SrcElemBitWidth =
+        SrcTy->getElementType()->getPrimitiveSizeInBits();
+    assert(SrcElemBitWidth && "vector elements must have a bitwidth");
+    unsigned SrcNumElems = SrcTy->getNumElements();
+    SmallVector<BitCastInst *, 8> BCs;
+    DenseMap<Type *, Value *> NewBCs;
+    for (User *U : SVI.users())
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
+        if (!BC->use_empty())
+          // Only visit bitcasts that weren't previously handled.
+          BCs.push_back(BC);
+    for (BitCastInst *BC : BCs) {
+      Type *TgtTy = BC->getDestTy();
+      unsigned TgtElemBitWidth = TgtTy->getPrimitiveSizeInBits();
+      if (!TgtElemBitWidth)
+        continue;
+      unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
+      bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
+      bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
+      if (!VecBitWidthsEqual)
+        continue;
+      if (!VectorType::isValidElementType(TgtTy))
+        continue;
+      VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
+      if (!BegIsAligned) {
+        // Shuffle the input so [0,NumElements) contains the output, and
+        // [NumElems,SrcNumElems) is undef.
+        SmallVector<Constant *, 16> ShuffleMask(SrcNumElems,
+                                                UndefValue::get(Int32Ty));
+        for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
+          ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
+        V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                         ConstantVector::get(ShuffleMask),
+                                         SVI.getName() + ".extract");
+        BegIdx = 0;
+      }
+      unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
+      assert(SrcElemsPerTgtElem);
+      BegIdx /= SrcElemsPerTgtElem;
+      bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
+      auto *NewBC =
+          BCAlreadyExists
+              ? NewBCs[CastSrcTy]
+              : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+      if (!BCAlreadyExists)
+        NewBCs[CastSrcTy] = NewBC;
+      auto *Ext = Builder->CreateExtractElement(
+          NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
+      // The shufflevector isn't being replaced: the bitcast that used it
+      // is. InstCombine will visit the newly-created instructions.
+      ReplaceInstUsesWith(*BC, Ext);
+      MadeChange = true;
+    }
+  }
+
   // If the LHS is a shufflevector itself, see if we can combine it with this
   // one without producing an unusual shuffle.
   // Cases that might be simplified:
@@ -1099,7 +1211,6 @@
   // or is a splat, do the replacement.
   if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
     SmallVector<Constant*, 16> Elts;
-    Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
     for (unsigned i = 0, e = newMask.size(); i != e; ++i) {
       if (newMask[i] < 0) {
         Elts.push_back(UndefValue::get(Int32Ty));
Index: test/Transforms/InstCombine/type_pun.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/type_pun.ll
@@ -0,0 +1,137 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Ensure that type punning using a union of vector and same-sized array
+; generates an extract instead of a shuffle with an uncommon vector size:
+;
+;   typedef uint32_t v4i32 __attribute__((vector_size(16)));
+;   union { v4i32 v; uint32_t a[4]; };
+;
+; This cleans up behind SROA, which inserts the uncommon vector size when
+; cleaning up the alloca/store/GEP/load.
+
+
+; Extracting the zeroth element in an i32 array.
+define i32 @type_pun_zeroth(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_zeroth(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: ret i32 %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Extracting the first element in an i32 array.
+define i32 @type_pun_first(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_first(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 1
+; CHECK-NEXT: ret i32 %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Extracting an i32 that isn't aligned to any natural boundary.
+define i32 @type_pun_misaligned(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_misaligned(
+; CHECK-NEXT: %[[SHUF:.*]] = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %[[SHUF]] to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: ret i32 %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Type punning to an array of pointers.
+define i32* @type_pun_pointer(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_pointer(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: %[[I2P:.*]] = inttoptr i32 %[[EXT]] to i32*
+; CHECK-NEXT: ret i32* %[[I2P]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to i32
+  %2 = inttoptr i32 %1 to i32*
+  ret i32* %2
+}
+
+; Type punning to an array of 32-bit floating-point values.
+define float @type_pun_float(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_float(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x float>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x float> %[[BC]], i32 0
+; CHECK-NEXT: ret float %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to float
+  ret float %1
+}
+
+; Type punning to an array of 64-bit floating-point values.
+define double @type_pun_double(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_double(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <2 x double>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <2 x double> %[[BC]], i32 0
+; CHECK-NEXT: ret double %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <8 x i8> %sroa to double
+  ret double %1
+}
+
+; Type punning to same-size floating-point and integer values.
+; Verify that multiple uses with different bitcast types are properly handled.
+define { float, i32 } @type_pun_float_i32(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_float_i32(
+; CHECK-NEXT: %[[BCI:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXTI:.*]] = extractelement <4 x i32> %[[BCI]], i32 0
+; CHECK-NEXT: %[[BCF:.*]] = bitcast <16 x i8> %in to <4 x float>
+; CHECK-NEXT: %[[EXTF:.*]] = extractelement <4 x float> %[[BCF]], i32 0
+; CHECK-NEXT: %1 = insertvalue { float, i32 } undef, float %[[EXTF]], 0
+; CHECK-NEXT: %2 = insertvalue { float, i32 } %1, i32 %[[EXTI]], 1
+; CHECK-NEXT: ret { float, i32 } %2
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %f = bitcast <4 x i8> %sroa to float
+  %i = bitcast <4 x i8> %sroa to i32
+  %1 = insertvalue { float, i32 } undef, float %f, 0
+  %2 = insertvalue { float, i32 } %1, i32 %i, 1
+  ret { float, i32 } %2
+}
+
+; Type punning two i32 values, with control flow.
+; Verify that the bitcast is shared and dominates usage.
+define i32 @type_pun_i32_ctrl(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_i32_ctrl(
+entry: ; CHECK-NEXT: entry:
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: br
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  br i1 undef, label %left, label %right
+left: ; CHECK: left:
+; CHECK-NEXT: %[[EXTL:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: br
+  %lhs = bitcast <4 x i8> %sroa to i32
+  br label %tail
+right: ; CHECK: right:
+; CHECK-NEXT: %[[EXTR:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: br
+  %rhs = bitcast <4 x i8> %sroa to i32
+  br label %tail
+tail: ; CHECK: tail:
+; CHECK-NEXT: %i = phi i32 [ %[[EXTL]], %left ], [ %[[EXTR]], %right ]
+; CHECK-NEXT: ret i32 %i
+  %i = phi i32 [ %lhs, %left ], [ %rhs, %right ]
+  ret i32 %i
+}
+
+; Extracting a type that won't fit in a vector isn't handled. The function
+; should stay the same.
+define i40 @type_pun_unhandled(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_unhandled(
+; CHECK-NEXT: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT: %1 = bitcast <5 x i8> %sroa to i40
+; CHECK-NEXT: ret i40 %1
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
+  %1 = bitcast <5 x i8> %sroa to i40
+  ret i40 %1
+}