diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -29,10 +29,12 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/Support/KnownBits.h"

 using namespace llvm;

@@ -61,6 +63,9 @@
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
     Ops.push_back(I->getOperand(0));
     Ops.push_back(I->getOperand(1));
     break;
@@ -98,8 +103,33 @@
       // Worklist and the Stack, and add it to the instruction info map.
       Worklist.pop_back();
       Stack.pop_back();
+
       // Insert I to the Info map.
-      InstInfoMap.insert(std::make_pair(I, Info()));
+      // Initialize MinBitWidth for shift instructions to the smallest width
+      // satisfying both conditions:
+      // 1. The shift amount is at least one less than MinBitWidth.
+      // 2. For right shifts, all truncated bits are zero; for arithmetic
+      //    shifts, at least one untruncated bit is zero as well.
+      // Also clamp MinBitWidth so it is not greater than the source bitwidth.
+      Info InstInfo;
+      unsigned int MinBitWidth = 0;
+      if (I->getOpcode() == Instruction::Shl ||
+          I->getOpcode() == Instruction::LShr ||
+          I->getOpcode() == Instruction::AShr) {
+        KnownBits KnownLHS = computeKnownBits(I->getOperand(0), DL);
+        KnownBits KnownRHS = computeKnownBits(I->getOperand(1), DL);
+        const unsigned int SrcBitWidth = KnownLHS.getBitWidth();
+        if (I->getOpcode() != Instruction::Shl)
+          MinBitWidth = SrcBitWidth - KnownLHS.countMinLeadingZeros();
+        if (I->getOpcode() == Instruction::AShr && MinBitWidth < SrcBitWidth)
+          MinBitWidth++;
+        InstInfo.MinBitWidth =
+            std::max(uint64_t(MinBitWidth),
+                     std::min(KnownRHS.getMaxValue().getZExtValue(),
+                              uint64_t(SrcBitWidth - 1)) +
+                         1);
+      }
+      InstInfoMap.insert(std::make_pair(I, InstInfo));
       continue;
     }

@@ -127,6 +157,9 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
     case Instruction::Select: {
       SmallVector<Value *, 2> Operands;
       getRelevantOperands(I, Operands);
@@ -137,8 +170,7 @@
       // TODO: Can handle more cases here:
       // 1. shufflevector, extractelement, insertelement
       // 2. udiv, urem
-      // 3. shl, lshr, ashr
-      // 4. phi node(and loop handling)
+      // 3. phi node(and loop handling)
       // ...
       return false;
     }
@@ -356,10 +388,19 @@
     case Instruction::Mul:
     case Instruction::And:
     case Instruction::Or:
-    case Instruction::Xor: {
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
       Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
       Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
       Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+      // Try to preserve wrap/exact flags, but drop nsw: `shl nsw` is more
+      // poisonous if the bitwidth is smaller.
+      if (Opc == Instruction::Shl)
+        cast<Instruction>(Res)->setHasNoUnsignedWrap(I->hasNoUnsignedWrap());
+      if (Opc == Instruction::LShr || Opc == Instruction::AShr)
+        cast<Instruction>(Res)->setIsExact(I->isExact());
       break;
     }
     case Instruction::Select: {
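A worked example of the MinBitWidth rule above (illustrative sketch only, not part of the patch; the function name is made up, but the pattern mirrors the lshr_trunc_commute test below): for an operand produced by `zext i16 ... to i32`, computeKnownBits reports 16 leading zero bits, so a right shift starts from MinBitWidth = 32 - 16 = 16, and a constant shift amount of 15 raises it to max(16, min(15, 31) + 1) = 16. Since the destination of the trunc is i16, the whole expression DAG can be evaluated in 16 bits:

define i16 @lshr_minbitwidth_example(i16 %x) {
  %zext = zext i16 %x to i32      ; KnownLHS has 16 leading zeros -> MinBitWidth = 32 - 16 = 16
  %lshr = lshr i32 %zext, 15      ; KnownRHS max is 15 -> MinBitWidth = max(16, 15 + 1) = 16
  %trunc = trunc i32 %lshr to i16 ; 16 bits are enough, so the shift is reduced to i16
  ret i16 %trunc
}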
diff --git a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
@@ -3,11 +3,10 @@

 define void @trunc_one_add(i16* %a, i8 %b) {
 ; CHECK-LABEL: @trunc_one_add(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ZEXT]], 1
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[ZEXT]], [[SHR]]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ZEXT]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ZEXT]], [[SHR]]
+; CHECK-NEXT: store i16 [[ADD]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
 %zext = zext i8 %b to i32
@@ -20,14 +19,13 @@

 define void @trunc_two_adds(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @trunc_two_adds(
-; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[ADD1]], 1
-; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[ADD2]], 2
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT: [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i16 [[ADD1]], 1
+; CHECK-NEXT: [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT: [[SHR2:%.*]] = lshr i16 [[ADD2]], 2
+; CHECK-NEXT: store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
 %zext1 = zext i8 %b to i32
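The trunc_shifts.ll changes below also pin down the flag handling: nuw and exact survive the reduction, while nsw is dropped. A minimal sketch of why (illustrative, not part of the patch; it mirrors the @shl_nsw test): with %x = 255, `shl nsw i32 255, 8` yields 65280 with no signed wrap, but the same `shl nsw` performed at i16 would exceed the i16 signed maximum (32767) and produce poison, so only the flags that remain valid at the narrow width are copied to the reduced instruction:

define i16 @shl_nsw_poison_sketch(i8 %x) {
  %z = zext i8 %x to i32
  %s = shl nsw i32 %z, 8     ; no signed wrap in i32: the result is at most 65280
  %t = trunc i32 %s to i16   ; the reduced form is `shl i16 %z, 8` -- nsw is not copied
  ret i16 %t
}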
diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
@@ -3,10 +3,8 @@

 define i16 @lshr_trunc_commute(i16 %x) {
 ; CHECK-LABEL: @lshr_trunc_commute(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[ZEXT]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[LSHR:%.*]] = lshr i16 [[X:%.*]], 15
+; CHECK-NEXT: ret i16 [[LSHR]]
 ;
 %zext = zext i16 %x to i32
 %lshr = lshr i32 %zext, 15
@@ -42,11 +40,9 @@

 define i16 @ashr_trunc_commute(i16 %x) {
 ; CHECK-LABEL: @ashr_trunc_commute(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT]], 32767
-; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[AND]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ASHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[X:%.*]], 32767
+; CHECK-NEXT: [[ASHR:%.*]] = ashr i16 [[AND]], 15
+; CHECK-NEXT: ret i16 [[ASHR]]
 ;
 %zext = zext i16 %x to i32
 %and = and i32 %zext, 32767
@@ -76,14 +72,13 @@

 define i16 @var_shift_commute(i8 %x, i8 %amt) {
 ; CHECK-LABEL: @var_shift_commute(
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i32
-; CHECK-NEXT: [[ZA2:%.*]] = and i32 [[ZA]], 15
-; CHECK-NEXT: [[S:%.*]] = lshr i32 [[Z]], [[ZA2]]
-; CHECK-NEXT: [[A:%.*]] = add i32 [[S]], [[Z]]
-; CHECK-NEXT: [[S2:%.*]] = lshr i32 [[A]], 2
-; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S2]] to i16
-; CHECK-NEXT: ret i16 [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i16
+; CHECK-NEXT: [[ZA2:%.*]] = and i16 [[ZA]], 15
+; CHECK-NEXT: [[S:%.*]] = lshr i16 [[Z]], [[ZA2]]
+; CHECK-NEXT: [[A:%.*]] = add i16 [[S]], [[Z]]
+; CHECK-NEXT: [[S2:%.*]] = lshr i16 [[A]], 2
+; CHECK-NEXT: ret i16 [[S2]]
 ;
 %z = zext i8 %x to i32
 %za = zext i8 %amt to i32
@@ -97,16 +92,15 @@

 define void @big_dag(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @big_dag(
-; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT: [[SFT1:%.*]] = and i32 [[ADD1]], 15
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[ADD1]], [[SFT1]]
-; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT: [[SFT2:%.*]] = and i32 [[ADD2]], 7
-; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[ADD2]], [[SFT2]]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT: [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT: [[SFT1:%.*]] = and i16 [[ADD1]], 15
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i16 [[ADD1]], [[SFT1]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT: [[SFT2:%.*]] = and i16 [[ADD2]], 7
+; CHECK-NEXT: [[SHR2:%.*]] = lshr i16 [[ADD2]], [[SFT2]]
+; CHECK-NEXT: store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
 %zext1 = zext i8 %b to i32
@@ -124,13 +118,12 @@

 define <2 x i16> @vector_commute(<2 x i8> %x) {
 ; CHECK-LABEL: @vector_commute(
-; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
-; CHECK-NEXT: [[ZA:%.*]] = and <2 x i32> [[Z]],
-; CHECK-NEXT: [[S:%.*]] = lshr <2 x i32> [[Z]], [[ZA]]
-; CHECK-NEXT: [[A:%.*]] = add <2 x i32> [[S]], [[Z]]
-; CHECK-NEXT: [[S2:%.*]] = lshr <2 x i32> [[A]],
-; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S2]] to <2 x i16>
-; CHECK-NEXT: ret <2 x i16> [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i16>
+; CHECK-NEXT: [[ZA:%.*]] = and <2 x i16> [[Z]],
+; CHECK-NEXT: [[S:%.*]] = lshr <2 x i16> [[Z]], [[ZA]]
+; CHECK-NEXT: [[A:%.*]] = add <2 x i16> [[S]], [[Z]]
+; CHECK-NEXT: [[S2:%.*]] = lshr <2 x i16> [[A]],
+; CHECK-NEXT: ret <2 x i16> [[S2]]
 ;
 %z = zext <2 x i8> %x to <2 x i32>
 %za = and <2 x i32> %z,
@@ -182,6 +175,7 @@
 define i16 @shl_not_commute(i8 %x) {
 ; CHECK-LABEL: @shl_not_commute(
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], [[ZEXT]]
 ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16
 ; CHECK-NEXT: ret i16 [[TRUNC]]
 ;
 %zext = zext i8 %x to i32
 %shl = shl i32 %zext, %zext
@@ -193,10 +187,10 @@

 define i16 @shl_commute(i8 %x) {
 ; CHECK-LABEL: @shl_commute(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], [[AND]]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[ZEXT]], 15
+; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[ZEXT]], [[AND]]
+; CHECK-NEXT: ret i16 [[SHL]]
 ;
 %zext = zext i8 %x to i32
 %and = and i32 %zext, 15
@@ -207,10 +201,8 @@

 define i16 @lshr_exact(i16 %x) {
 ; CHECK-LABEL: @lshr_exact(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i32 [[ZEXT]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i16 [[X:%.*]], 15
+; CHECK-NEXT: ret i16 [[LSHR]]
 ;
 %zext = zext i16 %x to i32
 %lshr = lshr exact i32 %zext, 15
@@ -220,11 +212,9 @@

 define i16 @ashr_exact(i16 %x) {
 ; CHECK-LABEL: @ashr_exact(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT]], 32767
-; CHECK-NEXT: [[ASHR:%.*]] = ashr exact i32 [[AND]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ASHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[X:%.*]], 32767
+; CHECK-NEXT: [[ASHR:%.*]] = ashr exact i16 [[AND]], 15
+; CHECK-NEXT: ret i16 [[ASHR]]
 ;
 %zext = zext i16 %x to i32
 %and = and i32 %zext, 32767
@@ -235,10 +225,9 @@

 define i16 @shl_nuw(i8 %x, i8 %sh1) {
 ; CHECK-LABEL: @shl_nuw(
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[S:%.*]] = shl nuw i32 [[Z]], 8
-; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16
-; CHECK-NEXT: ret i16 [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[S:%.*]] = shl nuw i16 [[Z]], 8
+; CHECK-NEXT: ret i16 [[S]]
 ;
 %z = zext i8 %x to i32
 %s = shl nuw i32 %z, 8
@@ -248,10 +237,9 @@

 define i16 @shl_nsw(i8 %x, i8 %sh1) {
 ; CHECK-LABEL: @shl_nsw(
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[S:%.*]] = shl nsw i32 [[Z]], 8
-; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16
-; CHECK-NEXT: ret i16 [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[S:%.*]] = shl i16 [[Z]], 8
+; CHECK-NEXT: ret i16 [[S]]
 ;
 %z = zext i8 %x to i32
 %s = shl nsw i32 %z, 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
@@ -4,71 +4,35 @@

 define void @trunc_through_one_add(i16* noalias %0, i8* noalias readonly %1) {
 ; SSE-LABEL: @trunc_through_one_add(
-; SSE-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; SSE-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
-; SSE-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i32> [[TMP6]], [[TMP5]]
-; SSE-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
-; SSE-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; SSE-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP9]], <4 x i16>* [[TMP10]], align 2
-; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <4 x i8>*
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; SSE-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
-; SSE-NEXT: [[TMP16:%.*]] = lshr <4 x i32> [[TMP15]],
-; SSE-NEXT: [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP16]], [[TMP15]]
-; SSE-NEXT: [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]],
-; SSE-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
-; SSE-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP19]], <4 x i16>* [[TMP20]], align 2
-; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to <4 x i8>*
-; SSE-NEXT: [[TMP24:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1
-; SSE-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]],
-; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]],
-; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP22]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT: [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; SSE-NEXT: [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
-; SSE-NEXT: [[TMP36:%.*]] = lshr <4 x i32> [[TMP35]],
-; SSE-NEXT: [[TMP37:%.*]] = add nuw nsw <4 x i32> [[TMP36]], [[TMP35]]
-; SSE-NEXT: [[TMP38:%.*]] = lshr <4 x i32> [[TMP37]],
-; SSE-NEXT: [[TMP39:%.*]] = trunc <4 x i32> [[TMP38]] to <4 x i16>
-; SSE-NEXT: [[TMP40:%.*]] = bitcast i16* [[TMP32]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP39]], <4 x i16>* [[TMP40]], align 2
+; SSE-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; SSE-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16>
+; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i16> [[TMP5]],
+; SSE-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP6]], [[TMP5]]
+; SSE-NEXT: [[TMP8:%.*]] = lshr <8 x i16> [[TMP7]],
+; SSE-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
+; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
+; SSE-NEXT: [[TMP13:%.*]] = load <8 x i8>, <8 x i8>* [[TMP12]], align 1
+; SSE-NEXT: [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i16>
+; SSE-NEXT: [[TMP15:%.*]] = lshr <8 x i16> [[TMP14]],
+; SSE-NEXT: [[TMP16:%.*]] = add <8 x i16> [[TMP15]], [[TMP14]]
+; SSE-NEXT: [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]],
+; SSE-NEXT: [[TMP18:%.*]] = bitcast i16* [[TMP11]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP17]], <8 x i16>* [[TMP18]], align 2
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @trunc_through_one_add(
-; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
-; AVX-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
-; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]],
-; AVX-NEXT: [[TMP7:%.*]] = add nuw nsw <8 x i32> [[TMP6]], [[TMP5]]
-; AVX-NEXT: [[TMP8:%.*]] = lshr <8 x i32> [[TMP7]],
-; AVX-NEXT: [[TMP9:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
-; AVX-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2
-; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <8 x i8>*
-; AVX-NEXT: [[TMP14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
-; AVX-NEXT: [[TMP15:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i32>
-; AVX-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]],
-; AVX-NEXT: [[TMP17:%.*]] = add nuw nsw <8 x i32> [[TMP16]], [[TMP15]]
-; AVX-NEXT: [[TMP18:%.*]] = lshr <8 x i32> [[TMP17]],
-; AVX-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i16>
-; AVX-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP19]], <8 x i16>* [[TMP20]], align 2
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i16>
+; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP5]],
+; AVX-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP6]], [[TMP5]]
+; AVX-NEXT: [[TMP8:%.*]] = lshr <16 x i16> [[TMP7]],
+; AVX-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT: store <16 x i16> [[TMP8]], <16 x i16>* [[TMP9]], align 2
 ; AVX-NEXT: ret void
 ;
 %3 = load i8, i8* %1, align 1
@@ -218,99 +182,48 @@

 define void @trunc_through_two_adds(i16* noalias %0, i8* noalias readonly %1, i8* noalias readonly %2) {
 ; SSE-LABEL: @trunc_through_two_adds(
-; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; SSE-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
-; SSE-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP8:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; SSE-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
-; SSE-NEXT: [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
-; SSE-NEXT: [[TMP11:%.*]] = lshr <4 x i32> [[TMP10]],
-; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <4 x i32> [[TMP11]], [[TMP10]]
-; SSE-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP12]],
-; SSE-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
-; SSE-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP14]], <4 x i16>* [[TMP15]], align 2
-; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 4
-; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; SSE-NEXT: [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1
-; SSE-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
-; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
-; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
-; SSE-NEXT: [[TMP24:%.*]] = zext <4 x i8> [[TMP23]] to <4 x i32>
-; SSE-NEXT: [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP24]], [[TMP21]]
-; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]],
-; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]],
-; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
-; SSE-NEXT: [[TMP36:%.*]] = zext <4 x i8> [[TMP35]] to <4 x i32>
-; SSE-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP32]] to <4 x i8>*
-; SSE-NEXT: [[TMP38:%.*]] = load <4 x i8>, <4 x i8>* [[TMP37]], align 1
-; SSE-NEXT: [[TMP39:%.*]] = zext <4 x i8> [[TMP38]] to <4 x i32>
-; SSE-NEXT: [[TMP40:%.*]] = add nuw nsw <4 x i32> [[TMP39]], [[TMP36]]
-; SSE-NEXT: [[TMP41:%.*]] = lshr <4 x i32> [[TMP40]],
-; SSE-NEXT: [[TMP42:%.*]] = add nuw nsw <4 x i32> [[TMP41]], [[TMP40]]
-; SSE-NEXT: [[TMP43:%.*]] = lshr <4 x i32> [[TMP42]],
-; SSE-NEXT: [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i16>
-; SSE-NEXT: [[TMP45:%.*]] = bitcast i16* [[TMP33]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP44]], <4 x i16>* [[TMP45]], align 2
-; SSE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 12
-; SSE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT: [[TMP49:%.*]] = bitcast i8* [[TMP46]] to <4 x i8>*
-; SSE-NEXT: [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1
-; SSE-NEXT: [[TMP51:%.*]] = zext <4 x i8> [[TMP50]] to <4 x i32>
-; SSE-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP47]] to <4 x i8>*
-; SSE-NEXT: [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
-; SSE-NEXT: [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
-; SSE-NEXT: [[TMP55:%.*]] = add nuw nsw <4 x i32> [[TMP54]], [[TMP51]]
-; SSE-NEXT: [[TMP56:%.*]] = lshr <4 x i32> [[TMP55]],
-; SSE-NEXT: [[TMP57:%.*]] = add nuw nsw <4 x i32> [[TMP56]], [[TMP55]]
-; SSE-NEXT: [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]],
-; SSE-NEXT: [[TMP59:%.*]] = trunc <4 x i32> [[TMP58]] to <4 x i16>
-; SSE-NEXT: [[TMP60:%.*]] = bitcast i16* [[TMP48]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP59]], <4 x i16>* [[TMP60]], align 2
+; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
+; SSE-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i16>
+; SSE-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; SSE-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
+; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP9]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = lshr <8 x i16> [[TMP10]],
+; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP11]], [[TMP10]]
+; SSE-NEXT: [[TMP13:%.*]] = lshr <8 x i16> [[TMP12]],
+; SSE-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP13]], <8 x i16>* [[TMP14]], align 2
+; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
+; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
+; SSE-NEXT: [[TMP19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1
+; SSE-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[TMP19]] to <8 x i16>
+; SSE-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
+; SSE-NEXT: [[TMP22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP21]], align 1
+; SSE-NEXT: [[TMP23:%.*]] = zext <8 x i8> [[TMP22]] to <8 x i16>
+; SSE-NEXT: [[TMP24:%.*]] = add <8 x i16> [[TMP23]], [[TMP20]]
+; SSE-NEXT: [[TMP25:%.*]] = lshr <8 x i16> [[TMP24]],
+; SSE-NEXT: [[TMP26:%.*]] = add <8 x i16> [[TMP25]], [[TMP24]]
+; SSE-NEXT: [[TMP27:%.*]] = lshr <8 x i16> [[TMP26]],
+; SSE-NEXT: [[TMP28:%.*]] = bitcast i16* [[TMP17]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP27]], <8 x i16>* [[TMP28]], align 2
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @trunc_through_two_adds(
-; AVX-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
-; AVX-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32>
-; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i32>
-; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <8 x i32> [[TMP9]], [[TMP6]]
-; AVX-NEXT: [[TMP11:%.*]] = lshr <8 x i32> [[TMP10]],
-; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <8 x i32> [[TMP11]], [[TMP10]]
-; AVX-NEXT: [[TMP13:%.*]] = lshr <8 x i32> [[TMP12]],
-; AVX-NEXT: [[TMP14:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i16>
-; AVX-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP14]], <8 x i16>* [[TMP15]], align 2
-; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
-; AVX-NEXT: [[TMP20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP19]], align 1
-; AVX-NEXT: [[TMP21:%.*]] = zext <8 x i8> [[TMP20]] to <8 x i32>
-; AVX-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>*
-; AVX-NEXT: [[TMP23:%.*]] = load <8 x i8>, <8 x i8>* [[TMP22]], align 1
-; AVX-NEXT: [[TMP24:%.*]] = zext <8 x i8> [[TMP23]] to <8 x i32>
-; AVX-NEXT: [[TMP25:%.*]] = add nuw nsw <8 x i32> [[TMP24]], [[TMP21]]
-; AVX-NEXT: [[TMP26:%.*]] = lshr <8 x i32> [[TMP25]],
-; AVX-NEXT: [[TMP27:%.*]] = add nuw nsw <8 x i32> [[TMP26]], [[TMP25]]
-; AVX-NEXT: [[TMP28:%.*]] = lshr <8 x i32> [[TMP27]],
-; AVX-NEXT: [[TMP29:%.*]] = trunc <8 x i32> [[TMP28]] to <8 x i16>
-; AVX-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP29]], <8 x i16>* [[TMP30]], align 2
+; AVX-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[TMP5]] to <16 x i16>
+; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
+; AVX-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
+; AVX-NEXT: [[TMP10:%.*]] = add <16 x i16> [[TMP9]], [[TMP6]]
+; AVX-NEXT: [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]],
+; AVX-NEXT: [[TMP12:%.*]] = add <16 x i16> [[TMP11]], [[TMP10]]
+; AVX-NEXT: [[TMP13:%.*]] = lshr <16 x i16> [[TMP12]],
+; AVX-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT: store <16 x i16> [[TMP13]], <16 x i16>* [[TMP14]], align 2
 ; AVX-NEXT: ret void
 ;
 %4 = load i8, i8* %1, align 1