diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -64,6 +64,7 @@
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::Shl:
+  case Instruction::LShr:
     Ops.push_back(I->getOperand(0));
     Ops.push_back(I->getOperand(1));
     break;
@@ -131,6 +132,7 @@
     case Instruction::Or:
     case Instruction::Xor:
     case Instruction::Shl:
+    case Instruction::LShr:
     case Instruction::Select: {
       SmallVector<Value *, 2> Operands;
       getRelevantOperands(I, Operands);
@@ -141,7 +143,7 @@
       // TODO: Can handle more cases here:
       // 1. shufflevector, extractelement, insertelement
       // 2. udiv, urem
-      // 3. lshr, ashr
+      // 3. ashr
       // 4. phi node(and loop handling)
       // ...
       return false;
@@ -274,12 +276,15 @@
   unsigned OrigBitWidth =
       CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
-  // Initialize MinBitWidth for `shl` instructions with the minimum number
-  // that is greater than shift amount (i.e. shift amount + 1).
+  // Initialize MinBitWidth for shift instructions with the minimum number
+  // that is greater than shift amount (i.e. shift amount + 1). For `lshr`
+  // adjust MinBitWidth so that all potentially truncated bits of
+  // the value-to-be-shifted are zeros.
   // Also normalize MinBitWidth not to be greater than source bitwidth.
   for (auto &Itr : InstInfoMap) {
     Instruction *I = Itr.first;
-    if (I->getOpcode() == Instruction::Shl) {
+    if (I->getOpcode() == Instruction::Shl ||
+        I->getOpcode() == Instruction::LShr) {
       KnownBits KnownRHS = computeKnownBits(I->getOperand(1), DL);
       const unsigned SrcBitWidth = KnownRHS.getBitWidth();
       unsigned MinBitWidth = KnownRHS.getMaxValue()
@@ -287,6 +292,13 @@
                                  .getLimitedValue(SrcBitWidth);
       if (MinBitWidth >= OrigBitWidth)
         return nullptr;
+      if (I->getOpcode() == Instruction::LShr) {
+        KnownBits KnownLHS = computeKnownBits(I->getOperand(0), DL);
+        MinBitWidth =
+            std::max(MinBitWidth, KnownLHS.getMaxValue().getActiveBits());
+        if (MinBitWidth >= OrigBitWidth)
+          return nullptr;
+      }
       Itr.second.MinBitWidth = MinBitWidth;
     }
   }
@@ -378,10 +390,14 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
-    case Instruction::Shl: {
+    case Instruction::Shl:
+    case Instruction::LShr: {
      Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
      Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
      Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+      // Preserve `exact` flag since truncation doesn't change exactness
+      if (Opc == Instruction::LShr)
+        cast<Instruction>(Res)->setIsExact(I->isExact());
      break;
    }
    case Instruction::Select: {
diff --git a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
@@ -3,11 +3,10 @@
 define void @trunc_one_add(i16* %a, i8 %b) {
 ; CHECK-LABEL: @trunc_one_add(
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ZEXT]], 1
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXT]], [[SHR]]
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16
-; CHECK-NEXT:    store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i16 [[ZEXT]], 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[ZEXT]], [[SHR]]
+; CHECK-NEXT:    store i16 [[ADD]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %zext = zext i8 %b to i32
@@ -20,14 +19,13 @@
 define void @trunc_two_adds(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @trunc_two_adds(
-; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT:    [[ADD1:%.*]] = add nuw nsw i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[ADD1]], 1
-; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT:    [[SHR2:%.*]] = lshr i32 [[ADD2]], 2
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT:    store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT:    [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i16 [[ADD1]], 1
+; CHECK-NEXT:    [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT:    [[SHR2:%.*]] = lshr i16 [[ADD2]], 2
+; CHECK-NEXT:    store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %zext1 = zext i8 %b to i32
diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
@@ -190,10 +190,8 @@
 define i16 @lshr_15(i16 %x) {
 ; CHECK-LABEL: @lshr_15(
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[ZEXT]], 15
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16
-; CHECK-NEXT:    ret i16 [[TRUNC]]
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i16 [[X:%.*]], 15
+; CHECK-NEXT:    ret i16 [[LSHR]]
 ;
   %zext = zext i16 %x to i32
   %lshr = lshr i32 %zext, 15
@@ -239,14 +237,13 @@
 define i16 @lshr_var_bounded_shift_amount(i8 %x, i8 %amt) {
 ; CHECK-LABEL: @lshr_var_bounded_shift_amount(
-; CHECK-NEXT:    [[Z:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i32
-; CHECK-NEXT:    [[ZA2:%.*]] = and i32 [[ZA]], 15
-; CHECK-NEXT:    [[S:%.*]] = lshr i32 [[Z]], [[ZA2]]
-; CHECK-NEXT:    [[A:%.*]] = add i32 [[S]], [[Z]]
-; CHECK-NEXT:    [[S2:%.*]] = lshr i32 [[A]], 2
-; CHECK-NEXT:    [[T:%.*]] = trunc i32 [[S2]] to i16
-; CHECK-NEXT:    ret i16 [[T]]
+; CHECK-NEXT:    [[Z:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT:    [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i16
+; CHECK-NEXT:    [[ZA2:%.*]] = and i16 [[ZA]], 15
+; CHECK-NEXT:    [[S:%.*]] = lshr i16 [[Z]], [[ZA2]]
+; CHECK-NEXT:    [[A:%.*]] = add i16 [[S]], [[Z]]
+; CHECK-NEXT:    [[S2:%.*]] = lshr i16 [[A]], 2
+; CHECK-NEXT:    ret i16 [[S2]]
 ;
   %z = zext i8 %x to i32
   %za = zext i8 %amt to i32
@@ -279,16 +276,15 @@
 define void @lshr_big_dag(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @lshr_big_dag(
-; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT:    [[SFT1:%.*]] = and i32 [[ADD1]], 15
-; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[ADD1]], [[SFT1]]
-; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT:    [[SFT2:%.*]] = and i32 [[ADD2]], 7
-; CHECK-NEXT:    [[SHR2:%.*]] = lshr i32 [[ADD2]], [[SFT2]]
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT:    store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT:    [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT:    [[SFT1:%.*]] = and i16 [[ADD1]], 15
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i16 [[ADD1]], [[SFT1]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT:    [[SFT2:%.*]] = and i16 [[ADD2]], 7
+; CHECK-NEXT:    [[SHR2:%.*]] = lshr i16 [[ADD2]], [[SFT2]]
+; CHECK-NEXT:    store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %zext1 = zext i8 %b to i32
@@ -308,10 +304,8 @@
 ; CHECK-LABEL: @lshr_smaller_bitwidth(
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
 ; CHECK-NEXT:    [[LSHR:%.*]] = lshr i16 [[ZEXT]], 1
-; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i16 [[LSHR]] to i32
-; CHECK-NEXT:    [[LSHR2:%.*]] = lshr i32 [[ZEXT2]], 2
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[LSHR2]] to i16
-; CHECK-NEXT:    ret i16 [[TRUNC]]
+; CHECK-NEXT:    [[LSHR2:%.*]] = lshr i16 [[LSHR]], 2
+; CHECK-NEXT:    ret i16 [[LSHR2]]
 ;
   %zext = zext i8 %x to i16
   %lshr = lshr i16 %zext, 1
@@ -323,12 +317,10 @@
 define i16 @lshr_larger_bitwidth(i8 %x) {
 ; CHECK-LABEL: @lshr_larger_bitwidth(
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i64
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i64 [[ZEXT]], 1
-; CHECK-NEXT:    [[ZEXT2:%.*]] = trunc i64 [[LSHR]] to i32
-; CHECK-NEXT:    [[AND:%.*]] = lshr i32 [[ZEXT2]], 2
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[AND]] to i16
-; CHECK-NEXT:    ret i16 [[TRUNC]]
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i16 [[ZEXT]], 1
+; CHECK-NEXT:    [[AND:%.*]] = lshr i16 [[LSHR]], 2
+; CHECK-NEXT:    ret i16 [[AND]]
 ;
   %zext = zext i8 %x to i64
   %lshr = lshr i64 %zext, 1
@@ -340,13 +332,12 @@
 define <2 x i16> @lshr_vector(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_vector(
-; CHECK-NEXT:    [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
-; CHECK-NEXT:    [[ZA:%.*]] = and <2 x i32> [[Z]],
-; CHECK-NEXT:    [[S:%.*]] = lshr <2 x i32> [[Z]], [[ZA]]
-; CHECK-NEXT:    [[A:%.*]] = add <2 x i32> [[S]], [[Z]]
-; CHECK-NEXT:    [[S2:%.*]] = lshr <2 x i32> [[A]],
-; CHECK-NEXT:    [[T:%.*]] = trunc <2 x i32> [[S2]] to <2 x i16>
-; CHECK-NEXT:    ret <2 x i16> [[T]]
+; CHECK-NEXT:    [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i16>
+; CHECK-NEXT:    [[ZA:%.*]] = and <2 x i16> [[Z]],
+; CHECK-NEXT:    [[S:%.*]] = lshr <2 x i16> [[Z]], [[ZA]]
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i16> [[S]], [[Z]]
+; CHECK-NEXT:    [[S2:%.*]] = lshr <2 x i16> [[A]],
+; CHECK-NEXT:    ret <2 x i16> [[S2]]
 ;
   %z = zext <2 x i8> %x to <2 x i32>
   %za = and <2 x i32> %z,
@@ -401,10 +392,8 @@
 define i16 @lshr_exact(i16 %x) {
 ; CHECK-LABEL: @lshr_exact(
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr exact i32 [[ZEXT]], 15
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16
-; CHECK-NEXT:    ret i16 [[TRUNC]]
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr exact i16 [[X:%.*]], 15
+; CHECK-NEXT:    ret i16 [[LSHR]]
 ;
   %zext = zext i16 %x to i32
   %lshr = lshr exact i32 %zext, 15
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll
@@ -4,71 +4,35 @@
 define void @trunc_through_one_add(i16* noalias %0, i8* noalias readonly %1) {
 ; SSE-LABEL: @trunc_through_one_add(
-; SSE-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; SSE-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
-; SSE-NEXT:    [[TMP7:%.*]] = add nuw nsw <4 x i32> [[TMP6]], [[TMP5]]
-; SSE-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
-; SSE-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; SSE-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP9]], <4 x i16>* [[TMP10]], align 2
-; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <4 x i8>*
-; SSE-NEXT:    [[TMP14:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; SSE-NEXT:    [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
-; SSE-NEXT:    [[TMP16:%.*]] = lshr <4 x i32> [[TMP15]],
-; SSE-NEXT:    [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP16]], [[TMP15]]
-; SSE-NEXT:    [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]],
-; SSE-NEXT:    [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
-; SSE-NEXT:    [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP19]], <4 x i16>* [[TMP20]], align 2
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP21]] to <4 x i8>*
-; SSE-NEXT:    [[TMP24:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1
-; SSE-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; SSE-NEXT:    [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]],
-; SSE-NEXT:    [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT:    [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]],
-; SSE-NEXT:    [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT:    [[TMP30:%.*]] = bitcast i16* [[TMP22]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT:    [[TMP33:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT:    [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; SSE-NEXT:    [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
-; SSE-NEXT:    [[TMP36:%.*]] = lshr <4 x i32> [[TMP35]],
-; SSE-NEXT:    [[TMP37:%.*]] = add nuw nsw <4 x i32> [[TMP36]], [[TMP35]]
-; SSE-NEXT:    [[TMP38:%.*]] = lshr <4 x i32> [[TMP37]],
-; SSE-NEXT:    [[TMP39:%.*]] = trunc <4 x i32> [[TMP38]] to <4 x i16>
-; SSE-NEXT:    [[TMP40:%.*]] = bitcast i16* [[TMP32]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP39]], <4 x i16>* [[TMP40]], align 2
+; SSE-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16>
+; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i16> [[TMP5]],
+; SSE-NEXT:    [[TMP7:%.*]] = add nuw nsw <8 x i16> [[TMP6]], [[TMP5]]
+; SSE-NEXT:    [[TMP8:%.*]] = lshr <8 x i16> [[TMP7]],
+; SSE-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
+; SSE-NEXT:    [[TMP13:%.*]] = load <8 x i8>, <8 x i8>* [[TMP12]], align 1
+; SSE-NEXT:    [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i16>
+; SSE-NEXT:    [[TMP15:%.*]] = lshr <8 x i16> [[TMP14]],
+; SSE-NEXT:    [[TMP16:%.*]] = add nuw nsw <8 x i16> [[TMP15]], [[TMP14]]
+; SSE-NEXT:    [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]],
+; SSE-NEXT:    [[TMP18:%.*]] = bitcast i16* [[TMP11]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP17]], <8 x i16>* [[TMP18]], align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @trunc_through_one_add(
-; AVX-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
-; AVX-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
-; AVX-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]],
-; AVX-NEXT:    [[TMP7:%.*]] = add nuw nsw <8 x i32> [[TMP6]], [[TMP5]]
-; AVX-NEXT:    [[TMP8:%.*]] = lshr <8 x i32> [[TMP7]],
-; AVX-NEXT:    [[TMP9:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
-; AVX-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <8 x i8>*
-; AVX-NEXT:    [[TMP14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
-; AVX-NEXT:    [[TMP15:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i32>
-; AVX-NEXT:    [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]],
-; AVX-NEXT:    [[TMP17:%.*]] = add nuw nsw <8 x i32> [[TMP16]], [[TMP15]]
-; AVX-NEXT:    [[TMP18:%.*]] = lshr <8 x i32> [[TMP17]],
-; AVX-NEXT:    [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i16>
-; AVX-NEXT:    [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP19]], <8 x i16>* [[TMP20]], align 2
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
+; AVX-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i16>
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP5]],
+; AVX-NEXT:    [[TMP7:%.*]] = add nuw nsw <16 x i16> [[TMP6]], [[TMP5]]
+; AVX-NEXT:    [[TMP8:%.*]] = lshr <16 x i16> [[TMP7]],
+; AVX-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT:    store <16 x i16> [[TMP8]], <16 x i16>* [[TMP9]], align 2
 ; AVX-NEXT:    ret void
 ;
   %3 = load i8, i8* %1, align 1
@@ -218,99 +182,48 @@
 define void @trunc_through_two_adds(i16* noalias %0, i8* noalias readonly %1, i8* noalias readonly %2) {
 ; SSE-LABEL: @trunc_through_two_adds(
-; SSE-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; SSE-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
-; SSE-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <4 x i8>*
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; SSE-NEXT:    [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
-; SSE-NEXT:    [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
-; SSE-NEXT:    [[TMP11:%.*]] = lshr <4 x i32> [[TMP10]],
-; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <4 x i32> [[TMP11]], [[TMP10]]
-; SSE-NEXT:    [[TMP13:%.*]] = lshr <4 x i32> [[TMP12]],
-; SSE-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
-; SSE-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP14]], <4 x i16>* [[TMP15]], align 2
-; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 4
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; SSE-NEXT:    [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1
-; SSE-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
-; SSE-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
-; SSE-NEXT:    [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
-; SSE-NEXT:    [[TMP24:%.*]] = zext <4 x i8> [[TMP23]] to <4 x i32>
-; SSE-NEXT:    [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP24]], [[TMP21]]
-; SSE-NEXT:    [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]],
-; SSE-NEXT:    [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT:    [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]],
-; SSE-NEXT:    [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT:    [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP34:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT:    [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
-; SSE-NEXT:    [[TMP36:%.*]] = zext <4 x i8> [[TMP35]] to <4 x i32>
-; SSE-NEXT:    [[TMP37:%.*]] = bitcast i8* [[TMP32]] to <4 x i8>*
-; SSE-NEXT:    [[TMP38:%.*]] = load <4 x i8>, <4 x i8>* [[TMP37]], align 1
-; SSE-NEXT:    [[TMP39:%.*]] = zext <4 x i8> [[TMP38]] to <4 x i32>
-; SSE-NEXT:    [[TMP40:%.*]] = add nuw nsw <4 x i32> [[TMP39]], [[TMP36]]
-; SSE-NEXT:    [[TMP41:%.*]] = lshr <4 x i32> [[TMP40]],
-; SSE-NEXT:    [[TMP42:%.*]] = add nuw nsw <4 x i32> [[TMP41]], [[TMP40]]
-; SSE-NEXT:    [[TMP43:%.*]] = lshr <4 x i32> [[TMP42]],
-; SSE-NEXT:    [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i16>
-; SSE-NEXT:    [[TMP45:%.*]] = bitcast i16* [[TMP33]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP44]], <4 x i16>* [[TMP45]], align 2
-; SSE-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 12
-; SSE-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT:    [[TMP49:%.*]] = bitcast i8* [[TMP46]] to <4 x i8>*
-; SSE-NEXT:    [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1
-; SSE-NEXT:    [[TMP51:%.*]] = zext <4 x i8> [[TMP50]] to <4 x i32>
-; SSE-NEXT:    [[TMP52:%.*]] = bitcast i8* [[TMP47]] to <4 x i8>*
-; SSE-NEXT:    [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
-; SSE-NEXT:    [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
-; SSE-NEXT:    [[TMP55:%.*]] = add nuw nsw <4 x i32> [[TMP54]], [[TMP51]]
-; SSE-NEXT:    [[TMP56:%.*]] = lshr <4 x i32> [[TMP55]],
-; SSE-NEXT:    [[TMP57:%.*]] = add nuw nsw <4 x i32> [[TMP56]], [[TMP55]]
-; SSE-NEXT:    [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]],
-; SSE-NEXT:    [[TMP59:%.*]] = trunc <4 x i32> [[TMP58]] to <4 x i16>
-; SSE-NEXT:    [[TMP60:%.*]] = bitcast i16* [[TMP48]] to <4 x i16>*
-; SSE-NEXT:    store <4 x i16> [[TMP59]], <4 x i16>* [[TMP60]], align 2
+; SSE-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
+; SSE-NEXT:    [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i16>
+; SSE-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; SSE-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
+; SSE-NEXT:    [[TMP10:%.*]] = add nuw nsw <8 x i16> [[TMP9]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <8 x i16> [[TMP10]],
+; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <8 x i16> [[TMP11]], [[TMP10]]
+; SSE-NEXT:    [[TMP13:%.*]] = lshr <8 x i16> [[TMP12]],
+; SSE-NEXT:    [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP13]], <8 x i16>* [[TMP14]], align 2
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
+; SSE-NEXT:    [[TMP19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1
+; SSE-NEXT:    [[TMP20:%.*]] = zext <8 x i8> [[TMP19]] to <8 x i16>
+; SSE-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
+; SSE-NEXT:    [[TMP22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP21]], align 1
+; SSE-NEXT:    [[TMP23:%.*]] = zext <8 x i8> [[TMP22]] to <8 x i16>
+; SSE-NEXT:    [[TMP24:%.*]] = add nuw nsw <8 x i16> [[TMP23]], [[TMP20]]
+; SSE-NEXT:    [[TMP25:%.*]] = lshr <8 x i16> [[TMP24]],
+; SSE-NEXT:    [[TMP26:%.*]] = add nuw nsw <8 x i16> [[TMP25]], [[TMP24]]
+; SSE-NEXT:    [[TMP27:%.*]] = lshr <8 x i16> [[TMP26]],
+; SSE-NEXT:    [[TMP28:%.*]] = bitcast i16* [[TMP17]] to <8 x i16>*
+; SSE-NEXT:    store <8 x i16> [[TMP27]], <8 x i16>* [[TMP28]], align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @trunc_through_two_adds(
-; AVX-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
-; AVX-NEXT:    [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32>
-; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
-; AVX-NEXT:    [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i32>
-; AVX-NEXT:    [[TMP10:%.*]] = add nuw nsw <8 x i32> [[TMP9]], [[TMP6]]
-; AVX-NEXT:    [[TMP11:%.*]] = lshr <8 x i32> [[TMP10]],
-; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <8 x i32> [[TMP11]], [[TMP10]]
-; AVX-NEXT:    [[TMP13:%.*]] = lshr <8 x i32> [[TMP12]],
-; AVX-NEXT:    [[TMP14:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i16>
-; AVX-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP14]], <8 x i16>* [[TMP15]], align 2
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
-; AVX-NEXT:    [[TMP20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP19]], align 1
-; AVX-NEXT:    [[TMP21:%.*]] = zext <8 x i8> [[TMP20]] to <8 x i32>
-; AVX-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>*
-; AVX-NEXT:    [[TMP23:%.*]] = load <8 x i8>, <8 x i8>* [[TMP22]], align 1
-; AVX-NEXT:    [[TMP24:%.*]] = zext <8 x i8> [[TMP23]] to <8 x i32>
-; AVX-NEXT:    [[TMP25:%.*]] = add nuw nsw <8 x i32> [[TMP24]], [[TMP21]]
-; AVX-NEXT:    [[TMP26:%.*]] = lshr <8 x i32> [[TMP25]],
-; AVX-NEXT:    [[TMP27:%.*]] = add nuw nsw <8 x i32> [[TMP26]], [[TMP25]]
-; AVX-NEXT:    [[TMP28:%.*]] = lshr <8 x i32> [[TMP27]],
-; AVX-NEXT:    [[TMP29:%.*]] = trunc <8 x i32> [[TMP28]] to <8 x i16>
-; AVX-NEXT:    [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>*
-; AVX-NEXT:    store <8 x i16> [[TMP29]], <8 x i16>* [[TMP30]], align 2
+; AVX-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; AVX-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP5]] to <16 x i16>
+; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <16 x i8>*
+; AVX-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
+; AVX-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
+; AVX-NEXT:    [[TMP10:%.*]] = add nuw nsw <16 x i16> [[TMP9]], [[TMP6]]
+; AVX-NEXT:    [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]],
+; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <16 x i16> [[TMP11]], [[TMP10]]
+; AVX-NEXT:    [[TMP13:%.*]] = lshr <16 x i16> [[TMP12]],
+; AVX-NEXT:    [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT:    store <16 x i16> [[TMP13]], <16 x i16>* [[TMP14]], align 2
 ; AVX-NEXT:    ret void
 ;
   %4 = load i8, i8* %1, align 1