diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -64,6 +64,10 @@
     Ops.push_back(I->getOperand(0));
     Ops.push_back(I->getOperand(1));
     break;
+  case Instruction::LShr:
+  case Instruction::AShr:
+    Ops.push_back(I->getOperand(0));
+    break;
   case Instruction::Select:
     Ops.push_back(I->getOperand(1));
     Ops.push_back(I->getOperand(2));
@@ -127,6 +131,8 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
+    case Instruction::LShr:
+    case Instruction::AShr:
     case Instruction::Select: {
       SmallVector<Value *, 2> Operands;
       getRelevantOperands(I, Operands);
@@ -137,7 +143,7 @@
       // TODO: Can handle more cases here:
       // 1. shufflevector, extractelement, insertelement
       // 2. udiv, urem
-      // 3. shl, lshr, ashr
+      // 3. shl
       // 4. phi node(and loop handling)
       // ...
       return false;
@@ -356,7 +362,9 @@
     case Instruction::Mul:
     case Instruction::And:
     case Instruction::Or:
-    case Instruction::Xor: {
+    case Instruction::Xor:
+    case Instruction::LShr:
+    case Instruction::AShr: {
       Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
       Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
       Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
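Sketch of the effect (hand-written restatement of the @trunc_one_add case in pr50555.ll below, not new functionality): with lshr/ashr accepted into the trunc expression DAG, a chain such as

    %zext = zext i8 %b to i32
    %shr  = lshr i32 %zext, 1
    %add  = add i32 %zext, %shr
    %tr   = trunc i32 %add to i16

can now be evaluated directly in the destination type, assuming the shift amount fits the narrower width:

    %zext = zext i8 %b to i16
    %shr  = lshr i16 %zext, 1
    %add  = add i16 %zext, %shr

The regenerated checks in the tests below reflect this narrowing (note that nuw/nsw flags are dropped, since they are not known to hold in the reduced type).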
diff --git a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
@@ -3,11 +3,10 @@
 
 define void @trunc_one_add(i16* %a, i8 %b) {
 ; CHECK-LABEL: @trunc_one_add(
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ZEXT]], 1
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXT]], [[SHR]]
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16
-; CHECK-NEXT:    store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i16 [[ZEXT]], 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[ZEXT]], [[SHR]]
+; CHECK-NEXT:    store i16 [[ADD]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %zext = zext i8 %b to i32
@@ -20,14 +19,13 @@
 
 define void @trunc_two_adds(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @trunc_two_adds(
-; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT:    [[ADD1:%.*]] = add nuw nsw i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[ADD1]], 1
-; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT:    [[SHR2:%.*]] = lshr i32 [[ADD2]], 2
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT:    store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT:    [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i16 [[ADD1]], 1
+; CHECK-NEXT:    [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT:    [[SHR2:%.*]] = lshr i16 [[ADD2]], 2
+; CHECK-NEXT:    store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %zext1 = zext i8 %b to i32
diff --git a/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll b/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll
--- a/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll
+++ b/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll
@@ -5,9 +5,9 @@
 
 define i16 @test1(i16 %a) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[C:%.*]] = lshr i16 [[A:%.*]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[A:%.*]], 8
 ; CHECK-NEXT:    [[D:%.*]] = mul i16 [[A]], 5
-; CHECK-NEXT:    [[E:%.*]] = or i16 [[C]], [[D]]
+; CHECK-NEXT:    [[E:%.*]] = or i16 [[D]], [[TMP1]]
 ; CHECK-NEXT:    ret i16 [[E]]
 ;
   %b = zext i16 %a to i32    ; <i32> [#uses=2]
@@ -20,9 +20,9 @@
 
 define <2 x i16> @test1_vec(<2 x i16> %a) {
 ; CHECK-LABEL: @test1_vec(
-; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 8, i16 8>
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 8, i16 8>
 ; CHECK-NEXT:    [[D:%.*]] = mul <2 x i16> [[A]], <i16 5, i16 5>
-; CHECK-NEXT:    [[E:%.*]] = or <2 x i16> [[C]], [[D]]
+; CHECK-NEXT:    [[E:%.*]] = or <2 x i16> [[D]], [[TMP1]]
 ; CHECK-NEXT:    ret <2 x i16> [[E]]
 ;
   %b = zext <2 x i16> %a to <2 x i32>
diff --git a/llvm/test/Transforms/InstCombine/apint-cast.ll b/llvm/test/Transforms/InstCombine/apint-cast.ll
--- a/llvm/test/Transforms/InstCombine/apint-cast.ll
+++ b/llvm/test/Transforms/InstCombine/apint-cast.ll
@@ -7,9 +7,9 @@
 
 define i17 @test1(i17 %a) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[C:%.*]] = lshr i17 [[A:%.*]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i17 [[A:%.*]], 8
 ; CHECK-NEXT:    [[D:%.*]] = shl i17 [[A]], 8
-; CHECK-NEXT:    [[E:%.*]] = or i17 [[C]], [[D]]
+; CHECK-NEXT:    [[E:%.*]] = or i17 [[D]], [[TMP1]]
 ; CHECK-NEXT:    ret i17 [[E]]
 ;
   %b = zext i17 %a to i37    ; <i37> [#uses=2]
@@ -22,9 +22,9 @@
 
 define i167 @test2(i167 %a) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[C:%.*]] = lshr i167 [[A:%.*]], 9
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i167 [[A:%.*]], 9
 ; CHECK-NEXT:    [[D:%.*]] = shl i167 [[A]], 8
-; CHECK-NEXT:    [[E:%.*]] = or i167 [[C]], [[D]]
+; CHECK-NEXT:    [[E:%.*]] = or i167 [[D]], [[TMP1]]
 ; CHECK-NEXT:    ret i167 [[E]]
 ;
   %b = zext i167 %a to i577   ; <i577> [#uses=2]
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -472,9 +472,9 @@
 
 define i16 @test40(i16 %a) {
 ; ALL-LABEL: @test40(
-; ALL-NEXT:    [[T21:%.*]] = lshr i16 [[A:%.*]], 9
+; ALL-NEXT:    [[TMP1:%.*]] = lshr i16 [[A:%.*]], 9
 ; ALL-NEXT:    [[T5:%.*]] = shl i16 [[A]], 8
-; ALL-NEXT:    [[T32:%.*]] = or i16 [[T21]], [[T5]]
+; ALL-NEXT:    [[T32:%.*]] = or i16 [[T5]], [[TMP1]]
 ; ALL-NEXT:    ret i16 [[T32]]
 ;
   %t = zext i16 %a to i32
@@ -487,9 +487,9 @@
 
 define <2 x i16> @test40vec(<2 x i16> %a) {
 ; ALL-LABEL: @test40vec(
-; ALL-NEXT:    [[T21:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 9, i16 9>
+; ALL-NEXT:    [[TMP1:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 9, i16 9>
 ; ALL-NEXT:    [[T5:%.*]] = shl <2 x i16> [[A]], <i16 8, i16 8>
-; ALL-NEXT:    [[T32:%.*]] = or <2 x i16> [[T21]], [[T5]]
+; ALL-NEXT:    [[T32:%.*]] = or <2 x i16> [[T5]], [[TMP1]]
 ; ALL-NEXT:    ret <2 x i16> [[T32]]
 ;
   %t = zext <2 x i16> %a to <2 x i32>
@@ -2084,8 +2084,8 @@
 ; ALL-LABEL: @trunc_lshr_zext_uses1(
 ; ALL-NEXT:    [[B:%.*]] = zext <2 x i8> [[A:%.*]] to <2 x i32>
 ; ALL-NEXT:    call void @use_v2i32(<2 x i32> [[B]])
-; ALL-NEXT:    [[C:%.*]] = lshr <2 x i8> [[A]], <i8 6, i8 6>
-; ALL-NEXT:    ret <2 x i8> [[C]]
+; ALL-NEXT:    [[TMP1:%.*]] = lshr <2 x i8> [[A]], <i8 6, i8 6>
+; ALL-NEXT:    ret <2 x i8> [[TMP1]]
 ;
   %B = zext <2 x i8> %A to <2 x i32>
   call void @use_v2i32(<2 x i32> %B)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
@@ -4,71 +4,35 @@
 
 define void @trunc_through_one_add(i16* noalias %0, i8* noalias readonly %1) {
 ; SSE-LABEL: @trunc_through_one_add(
-; SSE-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; SSE-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP7:%.*]] = add nuw nsw <4 x i32> [[TMP6]], [[TMP5]]
-; SSE-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; SSE-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP9]], <4 x i16>* [[TMP10]], align 2
-; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <4 x i8>*
-; SSE-NEXT:    [[TMP14:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; SSE-NEXT:    [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
-; SSE-NEXT:    [[TMP16:%.*]] = lshr <4 x i32> [[TMP15]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP16]], [[TMP15]]
-; SSE-NEXT:    [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
-; SSE-NEXT:    [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP19]], <4 x i16>* [[TMP20]], align 2
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP21]] to <4 x i8>*
-; SSE-NEXT:    [[TMP24:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1
-; SSE-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; SSE-NEXT:    [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT:    [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT:    [[TMP30:%.*]] = bitcast i16* [[TMP22]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT:    [[TMP33:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT:    [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; SSE-NEXT:    [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
-; SSE-NEXT:    [[TMP36:%.*]] = lshr <4 x i32> [[TMP35]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP37:%.*]] = add nuw nsw <4 x i32> [[TMP36]], [[TMP35]]
-; SSE-NEXT:    [[TMP38:%.*]] = lshr <4 x i32> [[TMP37]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP39:%.*]] = trunc <4 x i32> [[TMP38]] to <4 x i16>
-; SSE-NEXT:    [[TMP40:%.*]] = bitcast i16* [[TMP32]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP39]], <4 x i16>* [[TMP40]], align 2
+; SSE-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16>
+; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i16> [[TMP5]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP6]], [[TMP5]]
+; SSE-NEXT:    [[TMP8:%.*]] = lshr <8 x i16> [[TMP7]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
+; SSE-NEXT:    [[TMP13:%.*]] = load <8 x i8>, <8 x i8>* [[TMP12]], align 1
+; SSE-NEXT:    [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i16>
+; SSE-NEXT:    [[TMP15:%.*]] = lshr <8 x i16> [[TMP14]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[TMP16:%.*]] = add <8 x i16> [[TMP15]], [[TMP14]]
+; SSE-NEXT:    [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT:    [[TMP18:%.*]] = bitcast i16* [[TMP11]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP17]], <8 x i16>* [[TMP18]], align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @trunc_through_one_add(
-; AVX-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
-; AVX-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
-; AVX-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT:    [[TMP7:%.*]] = add nuw nsw <8 x i32> [[TMP6]], [[TMP5]]
-; AVX-NEXT:    [[TMP8:%.*]] = lshr <8 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT:    [[TMP9:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
-; AVX-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <8 x i8>*
-; AVX-NEXT:    [[TMP14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
-; AVX-NEXT:    [[TMP15:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i32>
-; AVX-NEXT:    [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT:    [[TMP17:%.*]] = add nuw nsw <8 x i32> [[TMP16]], [[TMP15]]
-; AVX-NEXT:    [[TMP18:%.*]] = lshr <8 x i32> [[TMP17]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT:    [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i16>
-; AVX-NEXT:    [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP19]], <8 x i16>* [[TMP20]], align 2
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
+; AVX-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i16>
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP5]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP6]], [[TMP5]]
+; AVX-NEXT:    [[TMP8:%.*]] = lshr <16 x i16> [[TMP7]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; AVX-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT:    store <16 x i16> [[TMP8]], <16 x i16>* [[TMP9]], align 2
 ; AVX-NEXT:    ret void
 ;
   %3 = load i8, i8* %1, align 1
@@ -218,99 +182,48 @@
 
 define void @trunc_through_two_adds(i16* noalias %0, i8* noalias readonly %1, i8* noalias readonly %2) {
 ; SSE-LABEL: @trunc_through_two_adds(
-; SSE-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; SSE-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
-; SSE-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <4 x i8>*
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; SSE-NEXT:    [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
-; SSE-NEXT:    [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
-; SSE-NEXT:    [[TMP11:%.*]] = lshr <4 x i32> [[TMP10]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <4 x i32> [[TMP11]], [[TMP10]]
-; SSE-NEXT:    [[TMP13:%.*]] = lshr <4 x i32> [[TMP12]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
-; SSE-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP14]], <4 x i16>* [[TMP15]], align 2
-; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 4
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; SSE-NEXT:    [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1
-; SSE-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
-; SSE-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
-; SSE-NEXT:    [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
-; SSE-NEXT:    [[TMP24:%.*]] = zext <4 x i8> [[TMP23]] to <4 x i32>
-; SSE-NEXT:    [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP24]], [[TMP21]]
-; SSE-NEXT:    [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT:    [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT:    [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP34:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT:    [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
-; SSE-NEXT:    [[TMP36:%.*]] = zext <4 x i8> [[TMP35]] to <4 x i32>
-; SSE-NEXT:    [[TMP37:%.*]] = bitcast i8* [[TMP32]] to <4 x i8>*
-; SSE-NEXT:    [[TMP38:%.*]] = load <4 x i8>, <4 x i8>* [[TMP37]], align 1
-; SSE-NEXT:    [[TMP39:%.*]] = zext <4 x i8> [[TMP38]] to <4 x i32>
-; SSE-NEXT:    [[TMP40:%.*]] = add nuw nsw <4 x i32> [[TMP39]], [[TMP36]]
-; SSE-NEXT:    [[TMP41:%.*]] = lshr <4 x i32> [[TMP40]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP42:%.*]] = add nuw nsw <4 x i32> [[TMP41]], [[TMP40]]
-; SSE-NEXT:    [[TMP43:%.*]] = lshr <4 x i32> [[TMP42]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i16>
-; SSE-NEXT:    [[TMP45:%.*]] = bitcast i16* [[TMP33]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP44]], <4 x i16>* [[TMP45]], align 2
-; SSE-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 12
-; SSE-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT:    [[TMP49:%.*]] = bitcast i8* [[TMP46]] to <4 x i8>*
-; SSE-NEXT:    [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1
-; SSE-NEXT:    [[TMP51:%.*]] = zext <4 x i8> [[TMP50]] to <4 x i32>
-; SSE-NEXT:    [[TMP52:%.*]] = bitcast i8* [[TMP47]] to <4 x i8>*
-; SSE-NEXT:    [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
-; SSE-NEXT:    [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
-; SSE-NEXT:    [[TMP55:%.*]] = add nuw nsw <4 x i32> [[TMP54]], [[TMP51]]
-; SSE-NEXT:    [[TMP56:%.*]] = lshr <4 x i32> [[TMP55]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT:    [[TMP57:%.*]] = add nuw nsw <4 x i32> [[TMP56]], [[TMP55]]
-; SSE-NEXT:    [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP59:%.*]] = trunc <4 x i32> [[TMP58]] to <4 x i16>
-; SSE-NEXT:    [[TMP60:%.*]] = bitcast i16* [[TMP48]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP59]], <4 x i16>* [[TMP60]], align 2
+; SSE-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
+; SSE-NEXT:    [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i16>
+; SSE-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; SSE-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
+; SSE-NEXT:    [[TMP10:%.*]] = add <8 x i16> [[TMP9]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <8 x i16> [[TMP10]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[TMP12:%.*]] = add <8 x i16> [[TMP11]], [[TMP10]]
+; SSE-NEXT:    [[TMP13:%.*]] = lshr <8 x i16> [[TMP12]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT:    [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP13]], <8 x i16>* [[TMP14]], align 2
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
+; SSE-NEXT:    [[TMP19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1
+; SSE-NEXT:    [[TMP20:%.*]] = zext <8 x i8> [[TMP19]] to <8 x i16>
+; SSE-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
+; SSE-NEXT:    [[TMP22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP21]], align 1
+; SSE-NEXT:    [[TMP23:%.*]] = zext <8 x i8> [[TMP22]] to <8 x i16>
+; SSE-NEXT:    [[TMP24:%.*]] = add <8 x i16> [[TMP23]], [[TMP20]]
+; SSE-NEXT:    [[TMP25:%.*]] = lshr <8 x i16> [[TMP24]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[TMP26:%.*]] = add <8 x i16> [[TMP25]], [[TMP24]]
+; SSE-NEXT:    [[TMP27:%.*]] = lshr <8 x i16> [[TMP26]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT:    [[TMP28:%.*]] = bitcast i16* [[TMP17]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP27]], <8 x i16>* [[TMP28]], align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @trunc_through_two_adds(
-; AVX-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
-; AVX-NEXT:    [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32>
-; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
-; AVX-NEXT:    [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i32>
-; AVX-NEXT:    [[TMP10:%.*]] = add nuw nsw <8 x i32> [[TMP9]], [[TMP6]]
-; AVX-NEXT:    [[TMP11:%.*]] = lshr <8 x i32> [[TMP10]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <8 x i32> [[TMP11]], [[TMP10]]
-; AVX-NEXT:    [[TMP13:%.*]] = lshr <8 x i32> [[TMP12]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT:    [[TMP14:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i16>
-; AVX-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP14]], <8 x i16>* [[TMP15]], align 2
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
-; AVX-NEXT:    [[TMP20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP19]], align 1
-; AVX-NEXT:    [[TMP21:%.*]] = zext <8 x i8> [[TMP20]] to <8 x i32>
-; AVX-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>*
-; AVX-NEXT:    [[TMP23:%.*]] = load <8 x i8>, <8 x i8>* [[TMP22]], align 1
-; AVX-NEXT:    [[TMP24:%.*]] = zext <8 x i8> [[TMP23]] to <8 x i32>
-; AVX-NEXT:    [[TMP25:%.*]] = add nuw nsw <8 x i32> [[TMP24]], [[TMP21]]
-; AVX-NEXT:    [[TMP26:%.*]] = lshr <8 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT:    [[TMP27:%.*]] = add nuw nsw <8 x i32> [[TMP26]], [[TMP25]]
-; AVX-NEXT:    [[TMP28:%.*]] = lshr <8 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT:    [[TMP29:%.*]] = trunc <8 x i32> [[TMP28]] to <8 x i16>
-; AVX-NEXT:    [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP29]], <8 x i16>* [[TMP30]], align 2
+; AVX-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; AVX-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP5]] to <16 x i16>
+; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <16 x i8>*
+; AVX-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
+; AVX-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
+; AVX-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[TMP9]], [[TMP6]]
+; AVX-NEXT:    [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT:    [[TMP12:%.*]] = add <16 x i16> [[TMP11]], [[TMP10]]
+; AVX-NEXT:    [[TMP13:%.*]] = lshr <16 x i16> [[TMP12]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; AVX-NEXT:    [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT:    store <16 x i16> [[TMP13]], <16 x i16>* [[TMP14]], align 2
 ; AVX-NEXT:    ret void
 ;
   %4 = load i8, i8* %1, align 1