Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -771,6 +771,11 @@
       setOperationAction(ISD::SRL, MVT::v1i128, Legal);
       setOperationAction(ISD::SRA, MVT::v1i128, Expand);
     }
+
+    if (Subtarget.hasP9Altivec()) { // Custom-lower halfword/byte inserts.
+      for (MVT EltInsVT : {MVT::v8i16, MVT::v16i8})
+        setOperationAction(ISD::INSERT_VECTOR_ELT, EltInsVT, Custom);
+    }
   }
 
   if (Subtarget.hasQPX()) {
@@ -8746,11 +8751,31 @@
                                                   SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
          "Should only be called for ISD::INSERT_VECTOR_ELT");
+
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   // We have legal lowering for constant indices but not for variable ones.
-  if (C)
-    return Op;
-  return SDValue();
+  if (!C)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
+  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+    SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
+    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
+    unsigned InsertAtElement = C->getZExtValue();
+    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
+    // VECINSERT is given a byte offset.  On little-endian targets that offset
+    // is mirrored, so element 0 lands at byte (16 - element size).
+    if (Subtarget.isLittleEndian())
+      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
+    return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
+                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
+  }
+  // Every other type reaching here is legal for a constant index.
+  return Op;
 }
 
 SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -1767,6 +1767,32 @@
                                   (COPY_TO_REGCLASS $S, VRRC),
                                   BE_VDWORD_PERM_VEC);
   dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
+  // VEXTRACTUH of the dword[1] halfwords; the suffix is the LE element index.
+  dag P9ALTIVEC_LE_HALF_3 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUH 8, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_HALF_2 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUH 10, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_HALF_1 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUH 12, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_HALF_0 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUH 14, $S), sub_64)), sub_32);
+  // VEXTRACTUB of the dword[1] bytes; the suffix is the LE element index.
+  dag P9ALTIVEC_LE_BYTE_7 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 8, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_BYTE_6 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 9, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_BYTE_5 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 10, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_BYTE_4 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 11, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_BYTE_3 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 12, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_BYTE_2 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 13, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_BYTE_1 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 14, $S), sub_64)), sub_32);
+  dag P9ALTIVEC_LE_BYTE_0 =
+    (EXTRACT_SUBREG (MFVSRD (EXTRACT_SUBREG (VEXTRACTUB 15, $S), sub_64)), sub_32);
 }
 
 let AddedComplexity = 400 in {
@@ -1877,6 +1903,64 @@
             (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
 } // IsBigEndian, HasDirectMove
 
+// Better BE vector_extract patterns for dword[1]; dags are named by LE index.
+let Predicates = [IsBigEndian, HasP9Altivec] in {
+  def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_3)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_2)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_1)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_0)>;
+
+  def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_7)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_6)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_5)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_4)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_3)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_2)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_1)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_0)>;
+} // IsBigEndian, HasP9Altivec
+
+// Better LE vector_extract patterns for dword[1]; dag suffix = element index.
+let Predicates = [IsLittleEndian, HasP9Altivec] in {
+  def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_0)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_1)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_2)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_HALF_3)>;
+
+  def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_0)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_1)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_2)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_3)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_4)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_5)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_6)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+            (i32 VectorExtractions.P9ALTIVEC_LE_BYTE_7)>;
+} // IsLittleEndian, HasP9Altivec
+
 // v4f32 scalar <-> vector conversions (LE)
 let Predicates = [IsLittleEndian, HasP8Vector] in {
   def : Pat<(v4f32 (scalar_to_vector f32:$A)),
@@ -2458,6 +2542,13 @@
                     UseVSXReg;
   } // mayStore
 
+  // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
+  // of f64; both select MTVSRWZ wrapped in SUBREG_TO_REG at sub_64.
+  def : Pat<(v8i16 (PPCmtvsrz i32:$A)),
+            (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+  def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
+            (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+
   // Patterns for which instructions from ISA 3.0 are a better match
   let Predicates = [IsLittleEndian, HasP9Vector] in {
   def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
Index: test/CodeGen/PowerPC/p9-vinsert-vextract.ll
===================================================================
--- test/CodeGen/PowerPC/p9-vinsert-vextract.ll
+++ test/CodeGen/PowerPC/p9-vinsert-vextract.ll
@@ -876,3 +876,493 @@
   %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
   ret <16 x i8> %vecins
 }
+
+; The following tests insert one halfword element into the vector.  We should
+; always use 'vinserth', at byte offset 2*idx on BE and 14 - 2*idx on LE.
+define <8 x i16> @insert_halfword_0(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_0
+; CHECK: vinserth 2, 3, 14
+; CHECK-BE-LABEL: insert_halfword_0
+; CHECK-BE: vinserth 2, 3, 0
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 0
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_1(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_1
+; CHECK: vinserth 2, 3, 12
+; CHECK-BE-LABEL: insert_halfword_1
+; CHECK-BE: vinserth 2, 3, 2
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 1
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_2(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_2
+; CHECK: vinserth 2, 3, 10
+; CHECK-BE-LABEL: insert_halfword_2
+; CHECK-BE: vinserth 2, 3, 4
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 2
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_3(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_3
+; CHECK: vinserth 2, 3, 8
+; CHECK-BE-LABEL: insert_halfword_3
+; CHECK-BE: vinserth 2, 3, 6
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 3
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_4(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_4
+; CHECK: vinserth 2, 3, 6
+; CHECK-BE-LABEL: insert_halfword_4
+; CHECK-BE: vinserth 2, 3, 8
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 4
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_5(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_5
+; CHECK: vinserth 2, 3, 4
+; CHECK-BE-LABEL: insert_halfword_5
+; CHECK-BE: vinserth 2, 3, 10
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 5
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_6(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_6
+; CHECK: vinserth 2, 3, 2
+; CHECK-BE-LABEL: insert_halfword_6
+; CHECK-BE: vinserth 2, 3, 12
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 6
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_7(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_7
+; CHECK: vinserth 2, 3, 0
+; CHECK-BE-LABEL: insert_halfword_7
+; CHECK-BE: vinserth 2, 3, 14
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 7
+  ret <8 x i16> %vecins
+}
+
+; The following tests insert one byte element into the vector.  We should
+; always use 'vinsertb', at byte offset idx on BE and 15 - idx on LE.
+define <16 x i8> @insert_byte_0(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_0
+; CHECK: vinsertb 2, 3, 15
+; CHECK-BE-LABEL: insert_byte_0
+; CHECK-BE: vinsertb 2, 3, 0
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 0
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_1(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_1
+; CHECK: vinsertb 2, 3, 14
+; CHECK-BE-LABEL: insert_byte_1
+; CHECK-BE: vinsertb 2, 3, 1
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 1
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_2(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_2
+; CHECK: vinsertb 2, 3, 13
+; CHECK-BE-LABEL: insert_byte_2
+; CHECK-BE: vinsertb 2, 3, 2
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 2
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_3(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_3
+; CHECK: vinsertb 2, 3, 12
+; CHECK-BE-LABEL: insert_byte_3
+; CHECK-BE: vinsertb 2, 3, 3
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 3
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_4(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_4
+; CHECK: vinsertb 2, 3, 11
+; CHECK-BE-LABEL: insert_byte_4
+; CHECK-BE: vinsertb 2, 3, 4
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 4
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_5(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_5
+; CHECK: vinsertb 2, 3, 10
+; CHECK-BE-LABEL: insert_byte_5
+; CHECK-BE: vinsertb 2, 3, 5
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 5
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_6(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_6
+; CHECK: vinsertb 2, 3, 9
+; CHECK-BE-LABEL: insert_byte_6
+; CHECK-BE: vinsertb 2, 3, 6
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 6
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_7(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_7
+; CHECK: vinsertb 2, 3, 8
+; CHECK-BE-LABEL: insert_byte_7
+; CHECK-BE: vinsertb 2, 3, 7
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 7
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_8(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_8
+; CHECK: vinsertb 2, 3, 7
+; CHECK-BE-LABEL: insert_byte_8
+; CHECK-BE: vinsertb 2, 3, 8
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 8
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_9(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_9
+; CHECK: vinsertb 2, 3, 6
+; CHECK-BE-LABEL: insert_byte_9
+; CHECK-BE: vinsertb 2, 3, 9
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 9
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_10(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_10
+; CHECK: vinsertb 2, 3, 5
+; CHECK-BE-LABEL: insert_byte_10
+; CHECK-BE: vinsertb 2, 3, 10
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 10
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_11(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_11
+; CHECK: vinsertb 2, 3, 4
+; CHECK-BE-LABEL: insert_byte_11
+; CHECK-BE: vinsertb 2, 3, 11
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 11
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_12(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_12
+; CHECK: vinsertb 2, 3, 3
+; CHECK-BE-LABEL: insert_byte_12
+; CHECK-BE: vinsertb 2, 3, 12
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 12
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_13(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_13
+; CHECK: vinsertb 2, 3, 2
+; CHECK-BE-LABEL: insert_byte_13
+; CHECK-BE: vinsertb 2, 3, 13
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 13
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_14(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_14
+; CHECK: vinsertb 2, 3, 1
+; CHECK-BE-LABEL: insert_byte_14
+; CHECK-BE: vinsertb 2, 3, 14
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 14
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_15(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_15
+; CHECK: vinsertb 2, 3, 0
+; CHECK-BE-LABEL: insert_byte_15
+; CHECK-BE: vinsertb 2, 3, 15
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 15
+  ret <16 x i8> %vecins
+}
+
+; The following tests extract one halfword element from the vector.  We should
+; only use 'vextractuh' for elements [0,3] on LE (offset 14 - 2*idx) and [4,7]
+; on BE (offset 2*idx), where it avoids an xxswapd.
+define i16 @extract_halfword_0(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_0
+; CHECK: vextractuh 2, 2, 14
+; CHECK-BE-LABEL: extract_halfword_0
+; CHECK-BE-NOT: vextractuh
+  %vecext = extractelement <8 x i16> %a, i32 0
+  ret i16 %vecext
+}
+
+define i16 @extract_halfword_1(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_1
+; CHECK: vextractuh 2, 2, 12
+; CHECK-BE-LABEL: extract_halfword_1
+; CHECK-BE-NOT: vextractuh
+  %vecext = extractelement <8 x i16> %a, i32 1
+  ret i16 %vecext
+}
+
+define i16 @extract_halfword_2(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_2
+; CHECK: vextractuh 2, 2, 10
+; CHECK-BE-LABEL: extract_halfword_2
+; CHECK-BE-NOT: vextractuh
+  %vecext = extractelement <8 x i16> %a, i32 2
+  ret i16 %vecext
+}
+
+define i16 @extract_halfword_3(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_3
+; CHECK: vextractuh 2, 2, 8
+; CHECK-BE-LABEL: extract_halfword_3
+; CHECK-BE-NOT: vextractuh
+  %vecext = extractelement <8 x i16> %a, i32 3
+  ret i16 %vecext
+}
+
+define i16 @extract_halfword_4(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_4
+; CHECK-NOT: vextractuh
+; CHECK-BE-LABEL: extract_halfword_4
+; CHECK-BE: vextractuh 2, 2, 8
+  %vecext = extractelement <8 x i16> %a, i32 4
+  ret i16 %vecext
+}
+
+define i16 @extract_halfword_5(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_5
+; CHECK-NOT: vextractuh
+; CHECK-BE-LABEL: extract_halfword_5
+; CHECK-BE: vextractuh 2, 2, 10
+  %vecext = extractelement <8 x i16> %a, i32 5
+  ret i16 %vecext
+}
+
+define i16 @extract_halfword_6(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_6
+; CHECK-NOT: vextractuh
+; CHECK-BE-LABEL: extract_halfword_6
+; CHECK-BE: vextractuh 2, 2, 12
+  %vecext = extractelement <8 x i16> %a, i32 6
+  ret i16 %vecext
+}
+
+define i16 @extract_halfword_7(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: extract_halfword_7
+; CHECK-NOT: vextractuh
+; CHECK-BE-LABEL: extract_halfword_7
+; CHECK-BE: vextractuh 2, 2, 14
+  %vecext = extractelement <8 x i16> %a, i32 7
+  ret i16 %vecext
+}
+
+; The following tests extract one byte element from the vector.  We should
+; only use 'vextractub' for elements [0,7] on LE (offset 15 - idx) and [8,15]
+; on BE (offset idx), where it avoids an xxswapd.
+define i8 @extract_byte_0(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_0
+; CHECK: vextractub 2, 2, 15
+; CHECK-BE-LABEL: extract_byte_0
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 0
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_1(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_1
+; CHECK: vextractub 2, 2, 14
+; CHECK-BE-LABEL: extract_byte_1
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 1
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_2(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_2
+; CHECK: vextractub 2, 2, 13
+; CHECK-BE-LABEL: extract_byte_2
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 2
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_3(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_3
+; CHECK: vextractub 2, 2, 12
+; CHECK-BE-LABEL: extract_byte_3
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 3
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_4(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_4
+; CHECK: vextractub 2, 2, 11
+; CHECK-BE-LABEL: extract_byte_4
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 4
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_5(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_5
+; CHECK: vextractub 2, 2, 10
+; CHECK-BE-LABEL: extract_byte_5
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 5
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_6(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_6
+; CHECK: vextractub 2, 2, 9
+; CHECK-BE-LABEL: extract_byte_6
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 6
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_7
+; CHECK: vextractub 2, 2, 8
+; CHECK-BE-LABEL: extract_byte_7
+; CHECK-BE-NOT: vextractub
+  %vecext = extractelement <16 x i8> %a, i32 7
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_8
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_8
+; CHECK-BE: vextractub 2, 2, 8
+  %vecext = extractelement <16 x i8> %a, i32 8
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_9(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_9
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_9
+; CHECK-BE: vextractub 2, 2, 9
+  %vecext = extractelement <16 x i8> %a, i32 9
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_10(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_10
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_10
+; CHECK-BE: vextractub 2, 2, 10
+  %vecext = extractelement <16 x i8> %a, i32 10
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_11(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_11
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_11
+; CHECK-BE: vextractub 2, 2, 11
+  %vecext = extractelement <16 x i8> %a, i32 11
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_12(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_12
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_12
+; CHECK-BE: vextractub 2, 2, 12
+  %vecext = extractelement <16 x i8> %a, i32 12
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_13(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_13
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_13
+; CHECK-BE: vextractub 2, 2, 13
+  %vecext = extractelement <16 x i8> %a, i32 13
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_14(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_14
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_14
+; CHECK-BE: vextractub 2, 2, 14
+  %vecext = extractelement <16 x i8> %a, i32 14
+  ret i8 %vecext
+}
+
+define i8 @extract_byte_15(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: extract_byte_15
+; CHECK-NOT: vextractub
+; CHECK-BE-LABEL: extract_byte_15
+; CHECK-BE: vextractub 2, 2, 15
+  %vecext = extractelement <16 x i8> %a, i32 15
+  ret i8 %vecext
+}