diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -64,6 +64,10 @@
     Ops.push_back(I->getOperand(0));
     Ops.push_back(I->getOperand(1));
     break;
+  case Instruction::AShr:
+  case Instruction::LShr:
+    Ops.push_back(I->getOperand(0));
+    break;
   case Instruction::Select:
     Ops.push_back(I->getOperand(1));
     Ops.push_back(I->getOperand(2));
@@ -127,6 +131,8 @@
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
+  case Instruction::LShr:
+  case Instruction::AShr:
   case Instruction::Select: {
     SmallVector<Value *, 2> Operands;
     getRelevantOperands(I, Operands);
@@ -137,7 +143,7 @@
     // TODO: Can handle more cases here:
     // 1. shufflevector, extractelement, insertelement
     // 2. udiv, urem
-    // 3. shl, lshr, ashr
+    // 3. shl
     // 4. phi node(and loop handling)
     // ...
     return false;
@@ -356,7 +362,9 @@
   case Instruction::Mul:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
+  case Instruction::Xor:
+  case Instruction::AShr:
+  case Instruction::LShr: {
     Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
     Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
     Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
@@ -3,11 +3,10 @@
 define void @trunc_one_add(i16* %a, i8 %b) {
 ; CHECK-LABEL: @trunc_one_add(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ZEXT]], 1
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[ZEXT]], [[SHR]]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ZEXT]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ZEXT]], [[SHR]]
+; CHECK-NEXT: store i16 [[ADD]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
   %zext = zext i8 %b to i32
@@ -20,14 +19,13 @@
 define void @trunc_two_adds(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @trunc_two_adds(
-; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[ADD1]], 1
-; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[ADD2]], 2
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT: [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i16 [[ADD1]], 1
+; CHECK-NEXT: [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT: [[SHR2:%.*]] = lshr i16 [[ADD2]], 2
+; CHECK-NEXT: store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
   %zext1 = zext i8 %b to i32
diff --git a/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll b/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll
--- a/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll
+++ b/llvm/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll
@@ -5,9 +5,9 @@
 define i16 @test1(i16 %a) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT: [[C:%.*]] = lshr i16 [[A:%.*]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[A:%.*]], 8
 ; CHECK-NEXT: [[D:%.*]] = mul i16 [[A]], 5
-; CHECK-NEXT: [[E:%.*]] = or i16 [[C]], [[D]]
+; CHECK-NEXT: [[E:%.*]] = or i16 [[D]], [[TMP1]]
 ; CHECK-NEXT: ret i16 [[E]]
 ;
   %b = zext i16 %a to i32 ; <i32> [#uses=2]
@@ -20,9 +20,9 @@
 define <2 x i16> @test1_vec(<2 x i16> %a) {
 ; CHECK-LABEL: @test1_vec(
-; CHECK-NEXT: [[C:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 8, i16 8>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 8, i16 8>
 ; CHECK-NEXT: [[D:%.*]] = mul <2 x i16> [[A]], <i16 5, i16 5>
-; CHECK-NEXT: [[E:%.*]] = or <2 x i16> [[C]], [[D]]
+; CHECK-NEXT: [[E:%.*]] = or <2 x i16> [[D]], [[TMP1]]
 ; CHECK-NEXT: ret <2 x i16> [[E]]
 ;
   %b = zext <2 x i16> %a to <2 x i32>
diff --git a/llvm/test/Transforms/InstCombine/apint-cast.ll b/llvm/test/Transforms/InstCombine/apint-cast.ll
--- a/llvm/test/Transforms/InstCombine/apint-cast.ll
+++ b/llvm/test/Transforms/InstCombine/apint-cast.ll
@@ -7,9 +7,9 @@
 define i17 @test1(i17 %a) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT: [[C:%.*]] = lshr i17 [[A:%.*]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i17 [[A:%.*]], 8
 ; CHECK-NEXT: [[D:%.*]] = shl i17 [[A]], 8
-; CHECK-NEXT: [[E:%.*]] = or i17 [[C]], [[D]]
+; CHECK-NEXT: [[E:%.*]] = or i17 [[D]], [[TMP1]]
 ; CHECK-NEXT: ret i17 [[E]]
 ;
   %b = zext i17 %a to i37 ; <i37> [#uses=2]
@@ -22,9 +22,9 @@
 define i167 @test2(i167 %a) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT: [[C:%.*]] = lshr i167 [[A:%.*]], 9
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i167 [[A:%.*]], 9
 ; CHECK-NEXT: [[D:%.*]] = shl i167 [[A]], 8
-; CHECK-NEXT: [[E:%.*]] = or i167 [[C]], [[D]]
+; CHECK-NEXT: [[E:%.*]] = or i167 [[D]], [[TMP1]]
 ; CHECK-NEXT: ret i167 [[E]]
 ;
   %b = zext i167 %a to i577 ; <i577> [#uses=2]
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -472,9 +472,9 @@
 define i16 @test40(i16 %a) {
 ; ALL-LABEL: @test40(
-; ALL-NEXT: [[T21:%.*]] = lshr i16 [[A:%.*]], 9
+; ALL-NEXT: [[TMP1:%.*]] = lshr i16 [[A:%.*]], 9
 ; ALL-NEXT: [[T5:%.*]] = shl i16 [[A]], 8
-; ALL-NEXT: [[T32:%.*]] = or i16 [[T21]], [[T5]]
+; ALL-NEXT: [[T32:%.*]] = or i16 [[T5]], [[TMP1]]
 ; ALL-NEXT: ret i16 [[T32]]
 ;
   %t = zext i16 %a to i32
@@ -487,9 +487,9 @@
 define <2 x i16> @test40vec(<2 x i16> %a) {
 ; ALL-LABEL: @test40vec(
-; ALL-NEXT: [[T21:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 9, i16 9>
+; ALL-NEXT: [[TMP1:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 9, i16 9>
 ; ALL-NEXT: [[T5:%.*]] = shl <2 x i16> [[A]], <i16 8, i16 8>
-; ALL-NEXT: [[T32:%.*]] = or <2 x i16> [[T21]], [[T5]]
+; ALL-NEXT: [[T32:%.*]] = or <2 x i16> [[T5]], [[TMP1]]
 ; ALL-NEXT: ret <2 x i16> [[T32]]
 ;
   %t = zext <2 x i16> %a to <2 x i32>
@@ -2084,8 +2084,8 @@
 ; ALL-LABEL: @trunc_lshr_zext_uses1(
 ; ALL-NEXT: [[B:%.*]] = zext <2 x i8> [[A:%.*]] to <2 x i32>
 ; ALL-NEXT: call void @use_v2i32(<2 x i32> [[B]])
-; ALL-NEXT: [[C:%.*]] = lshr <2 x i8> [[A]], <i8 6, i8 6>
-; ALL-NEXT: ret <2 x i8> [[C]]
+; ALL-NEXT: [[TMP1:%.*]] = lshr <2 x i8> [[A]], <i8 6, i8 6>
+; ALL-NEXT: ret <2 x i8> [[TMP1]]
 ;
   %B = zext <2 x i8> %A to <2 x i32>
   call void @use_v2i32(<2 x i32> %B)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
@@ -4,71 +4,35 @@
 define void @trunc_through_one_add(i16* noalias %0, i8* noalias readonly %1) {
 ; SSE-LABEL: @trunc_through_one_add(
-; SSE-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; SSE-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i32> [[TMP6]], [[TMP5]]
-; SSE-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; SSE-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP9]], <4 x i16>* [[TMP10]], align 2
-; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <4 x i8>*
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; SSE-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
-; SSE-NEXT: [[TMP16:%.*]] = lshr <4 x i32> [[TMP15]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP16]], [[TMP15]]
-; SSE-NEXT: [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
-; SSE-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP19]], <4 x i16>* [[TMP20]], align 2
-; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to <4 x i8>*
-; SSE-NEXT: [[TMP24:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1
-; SSE-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP22]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT: [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; SSE-NEXT: [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
-; SSE-NEXT: [[TMP36:%.*]] = lshr <4 x i32> [[TMP35]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP37:%.*]] = add nuw nsw <4 x i32> [[TMP36]], [[TMP35]]
-; SSE-NEXT: [[TMP38:%.*]] = lshr <4 x i32> [[TMP37]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP39:%.*]] = trunc <4 x i32> [[TMP38]] to <4 x i16>
-; SSE-NEXT: [[TMP40:%.*]] = bitcast i16* [[TMP32]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP39]], <4 x i16>* [[TMP40]], align 2
+; SSE-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; SSE-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16>
+; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i16> [[TMP5]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP6]], [[TMP5]]
+; SSE-NEXT: [[TMP8:%.*]] = lshr <8 x i16> [[TMP7]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
+; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
+; SSE-NEXT: [[TMP13:%.*]] = load <8 x i8>, <8 x i8>* [[TMP12]], align 1
+; SSE-NEXT: [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i16>
+; SSE-NEXT: [[TMP15:%.*]] = lshr <8 x i16> [[TMP14]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT: [[TMP16:%.*]] = add <8 x i16> [[TMP15]], [[TMP14]]
+; SSE-NEXT: [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT: [[TMP18:%.*]] = bitcast i16* [[TMP11]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP17]], <8 x i16>* [[TMP18]], align 2
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @trunc_through_one_add(
-; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
-; AVX-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
-; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT: [[TMP7:%.*]] = add nuw nsw <8 x i32> [[TMP6]], [[TMP5]]
-; AVX-NEXT: [[TMP8:%.*]] = lshr <8 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT: [[TMP9:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
-; AVX-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2
-; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <8 x i8>*
-; AVX-NEXT: [[TMP14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
-; AVX-NEXT: [[TMP15:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i32>
-; AVX-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT: [[TMP17:%.*]] = add nuw nsw <8 x i32> [[TMP16]], [[TMP15]]
-; AVX-NEXT: [[TMP18:%.*]] = lshr <8 x i32> [[TMP17]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i16>
-; AVX-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP19]], <8 x i16>* [[TMP20]], align 2
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i16>
+; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP5]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP6]], [[TMP5]]
+; AVX-NEXT: [[TMP8:%.*]] = lshr <16 x i16> [[TMP7]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; AVX-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT: store <16 x i16> [[TMP8]], <16 x i16>* [[TMP9]], align 2
 ; AVX-NEXT: ret void
 ;
   %3 = load i8, i8* %1, align 1
@@ -218,99 +182,48 @@
 define void @trunc_through_two_adds(i16* noalias %0, i8* noalias readonly %1, i8* noalias readonly %2) {
 ; SSE-LABEL: @trunc_through_two_adds(
-; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; SSE-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
-; SSE-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP8:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; SSE-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
-; SSE-NEXT: [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
-; SSE-NEXT: [[TMP11:%.*]] = lshr <4 x i32> [[TMP10]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <4 x i32> [[TMP11]], [[TMP10]]
-; SSE-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP12]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
-; SSE-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP14]], <4 x i16>* [[TMP15]], align 2
-; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 4
-; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; SSE-NEXT: [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1
-; SSE-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
-; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
-; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
-; SSE-NEXT: [[TMP24:%.*]] = zext <4 x i8> [[TMP23]] to <4 x i32>
-; SSE-NEXT: [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP24]], [[TMP21]]
-; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
-; SSE-NEXT: [[TMP36:%.*]] = zext <4 x i8> [[TMP35]] to <4 x i32>
-; SSE-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP32]] to <4 x i8>*
-; SSE-NEXT: [[TMP38:%.*]] = load <4 x i8>, <4 x i8>* [[TMP37]], align 1
-; SSE-NEXT: [[TMP39:%.*]] = zext <4 x i8> [[TMP38]] to <4 x i32>
-; SSE-NEXT: [[TMP40:%.*]] = add nuw nsw <4 x i32> [[TMP39]], [[TMP36]]
-; SSE-NEXT: [[TMP41:%.*]] = lshr <4 x i32> [[TMP40]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP42:%.*]] = add nuw nsw <4 x i32> [[TMP41]], [[TMP40]]
-; SSE-NEXT: [[TMP43:%.*]] = lshr <4 x i32> [[TMP42]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i16>
-; SSE-NEXT: [[TMP45:%.*]] = bitcast i16* [[TMP33]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP44]], <4 x i16>* [[TMP45]], align 2
-; SSE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 12
-; SSE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT: [[TMP49:%.*]] = bitcast i8* [[TMP46]] to <4 x i8>*
-; SSE-NEXT: [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1
-; SSE-NEXT: [[TMP51:%.*]] = zext <4 x i8> [[TMP50]] to <4 x i32>
-; SSE-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP47]] to <4 x i8>*
-; SSE-NEXT: [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
-; SSE-NEXT: [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
-; SSE-NEXT: [[TMP55:%.*]] = add nuw nsw <4 x i32> [[TMP54]], [[TMP51]]
-; SSE-NEXT: [[TMP56:%.*]] = lshr <4 x i32> [[TMP55]], <i32 1, i32 1, i32 1, i32 1>
-; SSE-NEXT: [[TMP57:%.*]] = add nuw nsw <4 x i32> [[TMP56]], [[TMP55]]
-; SSE-NEXT: [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT: [[TMP59:%.*]] = trunc <4 x i32> [[TMP58]] to <4 x i16>
-; SSE-NEXT: [[TMP60:%.*]] = bitcast i16* [[TMP48]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP59]], <4 x i16>* [[TMP60]], align 2
+; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
+; SSE-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i16>
+; SSE-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; SSE-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
+; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP9]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = lshr <8 x i16> [[TMP10]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP11]], [[TMP10]]
+; SSE-NEXT: [[TMP13:%.*]] = lshr <8 x i16> [[TMP12]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP13]], <8 x i16>* [[TMP14]], align 2
+; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
+; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
+; SSE-NEXT: [[TMP19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1
+; SSE-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[TMP19]] to <8 x i16>
+; SSE-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
+; SSE-NEXT: [[TMP22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP21]], align 1
+; SSE-NEXT: [[TMP23:%.*]] = zext <8 x i8> [[TMP22]] to <8 x i16>
+; SSE-NEXT: [[TMP24:%.*]] = add <8 x i16> [[TMP23]], [[TMP20]]
+; SSE-NEXT: [[TMP25:%.*]] = lshr <8 x i16> [[TMP24]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT: [[TMP26:%.*]] = add <8 x i16> [[TMP25]], [[TMP24]]
+; SSE-NEXT: [[TMP27:%.*]] = lshr <8 x i16> [[TMP26]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; SSE-NEXT: [[TMP28:%.*]] = bitcast i16* [[TMP17]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP27]], <8 x i16>* [[TMP28]], align 2
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @trunc_through_two_adds(
-; AVX-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
-; AVX-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32>
-; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i32>
-; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <8 x i32> [[TMP9]], [[TMP6]]
-; AVX-NEXT: [[TMP11:%.*]] = lshr <8 x i32> [[TMP10]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <8 x i32> [[TMP11]], [[TMP10]]
-; AVX-NEXT: [[TMP13:%.*]] = lshr <8 x i32> [[TMP12]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT: [[TMP14:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i16>
-; AVX-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP14]], <8 x i16>* [[TMP15]], align 2
-; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
-; AVX-NEXT: [[TMP20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP19]], align 1
-; AVX-NEXT: [[TMP21:%.*]] = zext <8 x i8> [[TMP20]] to <8 x i32>
-; AVX-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>*
-; AVX-NEXT: [[TMP23:%.*]] = load <8 x i8>, <8 x i8>* [[TMP22]], align 1
-; AVX-NEXT: [[TMP24:%.*]] = zext <8 x i8> [[TMP23]] to <8 x i32>
-; AVX-NEXT: [[TMP25:%.*]] = add nuw nsw <8 x i32> [[TMP24]], [[TMP21]]
-; AVX-NEXT: [[TMP26:%.*]] = lshr <8 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX-NEXT: [[TMP27:%.*]] = add nuw nsw <8 x i32> [[TMP26]], [[TMP25]]
-; AVX-NEXT: [[TMP28:%.*]] = lshr <8 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; AVX-NEXT: [[TMP29:%.*]] = trunc <8 x i32> [[TMP28]] to <8 x i16>
-; AVX-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP29]], <8 x i16>* [[TMP30]], align 2
+; AVX-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[TMP5]] to <16 x i16>
+; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
+; AVX-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
+; AVX-NEXT: [[TMP10:%.*]] = add <16 x i16> [[TMP9]], [[TMP6]]
+; AVX-NEXT: [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT: [[TMP12:%.*]] = add <16 x i16> [[TMP11]], [[TMP10]]
+; AVX-NEXT: [[TMP13:%.*]] = lshr <16 x i16> [[TMP12]], <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+; AVX-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT: store <16 x i16> [[TMP13]], <16 x i16>* [[TMP14]], align 2
 ; AVX-NEXT: ret void
 ;
   %4 = load i8, i8* %1, align 1