diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -29,10 +29,12 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/Support/KnownBits.h"

 using namespace llvm;

@@ -61,6 +63,9 @@
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
     Ops.push_back(I->getOperand(0));
     Ops.push_back(I->getOperand(1));
     break;
@@ -98,8 +103,33 @@
       // Worklist and the Stack, and add it to the instruction info map.
       Worklist.pop_back();
       Stack.pop_back();
+
       // Insert I to the Info map.
-      InstInfoMap.insert(std::make_pair(I, Info()));
+      // Initialize MinBitWidth for shift instructions to the smallest width
+      // satisfying both conditions:
+      // 1. The shift amount is at least one less than MinBitWidth.
+      // 2. For right shifts, all truncated bits are zero; for arithmetic
+      //    shifts, at least one untruncated bit is zero as well.
+      // Also clamp MinBitWidth so it is not greater than the source bitwidth.
+      Info InstInfo;
+      unsigned int MinBitWidth = 0;
+      if (I->getOpcode() == Instruction::Shl ||
+          I->getOpcode() == Instruction::LShr ||
+          I->getOpcode() == Instruction::AShr) {
+        KnownBits KnownLHS = computeKnownBits(I->getOperand(0), DL);
+        KnownBits KnownRHS = computeKnownBits(I->getOperand(1), DL);
+        const unsigned int SrcBitWidth = KnownLHS.getBitWidth();
+        if (I->getOpcode() != Instruction::Shl)
+          MinBitWidth = SrcBitWidth - KnownLHS.countMinLeadingZeros();
+        if (I->getOpcode() == Instruction::AShr && MinBitWidth < SrcBitWidth)
+          MinBitWidth++;
+        InstInfo.MinBitWidth =
+            std::max(uint64_t(MinBitWidth),
+                     std::min(KnownRHS.getMaxValue().getZExtValue(),
+                              uint64_t(SrcBitWidth - 1)) +
+                         1);
+      }
+      InstInfoMap.insert(std::make_pair(I, InstInfo));
       continue;
     }

@@ -127,6 +157,9 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
     case Instruction::Select: {
       SmallVector<Value *, 2> Operands;
       getRelevantOperands(I, Operands);
@@ -137,8 +170,7 @@
       // TODO: Can handle more cases here:
       // 1. shufflevector, extractelement, insertelement
       // 2. udiv, urem
-      // 3. shl, lshr, ashr
-      // 4. phi node(and loop handling)
+      // 3. phi node(and loop handling)
       // ...
       return false;
     }
@@ -356,10 +388,19 @@
     case Instruction::Mul:
     case Instruction::And:
     case Instruction::Or:
-    case Instruction::Xor: {
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
       Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
       Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
       Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+      // Try to preserve wrap/exact flags, but drop nsw: `shl nsw` is more
+      // poisonous if the bitwidth is smaller.
+      if (Opc == Instruction::Shl)
+        cast<Instruction>(Res)->setHasNoUnsignedWrap(I->hasNoUnsignedWrap());
+      if (Opc == Instruction::LShr || Opc == Instruction::AShr)
+        cast<Instruction>(Res)->setIsExact(I->isExact());
       break;
     }
     case Instruction::Select: {
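A worked example of the MinBitWidth rule above (illustrative sketch only, not part of the patch; the function name is made up, but the pattern mirrors the lshr_trunc_commute test below): for an operand produced by `zext i16 ... to i32`, computeKnownBits reports 16 leading zero bits, so a right shift starts from MinBitWidth = 32 - 16 = 16, and a constant shift amount of 15 raises it to max(16, min(15, 31) + 1) = 16. Since the destination of the trunc is i16, the whole expression DAG can be evaluated in 16 bits:

define i16 @lshr_minbitwidth_example(i16 %x) {
  %zext = zext i16 %x to i32      ; KnownLHS has 16 leading zeros -> MinBitWidth = 32 - 16 = 16
  %lshr = lshr i32 %zext, 15      ; KnownRHS max is 15 -> MinBitWidth = max(16, 15 + 1) = 16
  %trunc = trunc i32 %lshr to i16 ; 16 bits are enough, so the shift is reduced to i16
  ret i16 %trunc
}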
diff --git a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/pr50555.ll
@@ -3,11 +3,10 @@

 define void @trunc_one_add(i16* %a, i8 %b) {
 ; CHECK-LABEL: @trunc_one_add(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ZEXT]], 1
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[ZEXT]], [[SHR]]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ZEXT]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ZEXT]], [[SHR]]
+; CHECK-NEXT: store i16 [[ADD]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
 %zext = zext i8 %b to i32
@@ -20,14 +19,13 @@

 define void @trunc_two_adds(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @trunc_two_adds(
-; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[ADD1]], 1
-; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[ADD2]], 2
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT: [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i16 [[ADD1]], 1
+; CHECK-NEXT: [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT: [[SHR2:%.*]] = lshr i16 [[ADD2]], 2
+; CHECK-NEXT: store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
 %zext1 = zext i8 %b to i32
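The trunc_shifts.ll changes below also pin down the flag handling: nuw and exact survive the reduction, while nsw is dropped. A minimal sketch of why (illustrative, not part of the patch; it mirrors the @shl_nsw test): with %x = 255, `shl nsw i32 255, 8` yields 65280 with no signed wrap, but the same `shl nsw` performed at i16 would exceed the i16 signed maximum (32767) and produce poison, so only the flags that remain valid at the narrow width are copied to the reduced instruction:

define i16 @shl_nsw_poison_sketch(i8 %x) {
  %z = zext i8 %x to i32
  %s = shl nsw i32 %z, 8     ; no signed wrap in i32: the result is at most 65280
  %t = trunc i32 %s to i16   ; the reduced form is `shl i16 %z, 8` -- nsw is not copied
  ret i16 %t
}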
diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll
@@ -3,10 +3,8 @@

 define i16 @lshr_trunc_commute(i16 %x) {
 ; CHECK-LABEL: @lshr_trunc_commute(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[ZEXT]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[LSHR:%.*]] = lshr i16 [[X:%.*]], 15
+; CHECK-NEXT: ret i16 [[LSHR]]
 ;
 %zext = zext i16 %x to i32
 %lshr = lshr i32 %zext, 15
@@ -42,11 +40,9 @@

 define i16 @ashr_trunc_commute(i16 %x) {
 ; CHECK-LABEL: @ashr_trunc_commute(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT]], 32767
-; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[AND]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ASHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[X:%.*]], 32767
+; CHECK-NEXT: [[ASHR:%.*]] = ashr i16 [[AND]], 15
+; CHECK-NEXT: ret i16 [[ASHR]]
 ;
 %zext = zext i16 %x to i32
 %and = and i32 %zext, 32767
@@ -76,14 +72,13 @@

 define i16 @var_shift_commute(i8 %x, i8 %amt) {
 ; CHECK-LABEL: @var_shift_commute(
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i32
-; CHECK-NEXT: [[ZA2:%.*]] = and i32 [[ZA]], 15
-; CHECK-NEXT: [[S:%.*]] = lshr i32 [[Z]], [[ZA2]]
-; CHECK-NEXT: [[A:%.*]] = add i32 [[S]], [[Z]]
-; CHECK-NEXT: [[S2:%.*]] = lshr i32 [[A]], 2
-; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S2]] to i16
-; CHECK-NEXT: ret i16 [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i16
+; CHECK-NEXT: [[ZA2:%.*]] = and i16 [[ZA]], 15
+; CHECK-NEXT: [[S:%.*]] = lshr i16 [[Z]], [[ZA2]]
+; CHECK-NEXT: [[A:%.*]] = add i16 [[S]], [[Z]]
+; CHECK-NEXT: [[S2:%.*]] = lshr i16 [[A]], 2
+; CHECK-NEXT: ret i16 [[S2]]
 ;
 %z = zext i8 %x to i32
 %za = zext i8 %amt to i32
@@ -97,16 +92,15 @@

 define void @big_dag(i16* %a, i8 %b, i8 %c) {
 ; CHECK-LABEL: @big_dag(
-; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
-; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT: [[SFT1:%.*]] = and i32 [[ADD1]], 15
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[ADD1]], [[SFT1]]
-; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[SHR1]]
-; CHECK-NEXT: [[SFT2:%.*]] = and i32 [[ADD2]], 7
-; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[ADD2]], [[SFT2]]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i16
+; CHECK-NEXT: [[ADD1:%.*]] = add i16 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT: [[SFT1:%.*]] = and i16 [[ADD1]], 15
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i16 [[ADD1]], [[SFT1]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i16 [[ADD1]], [[SHR1]]
+; CHECK-NEXT: [[SFT2:%.*]] = and i16 [[ADD2]], 7
+; CHECK-NEXT: [[SHR2:%.*]] = lshr i16 [[ADD2]], [[SFT2]]
+; CHECK-NEXT: store i16 [[SHR2]], i16* [[A:%.*]], align 2
 ; CHECK-NEXT: ret void
 ;
 %zext1 = zext i8 %b to i32
@@ -124,13 +118,12 @@

 define <2 x i16> @vector_commute(<2 x i8> %x) {
 ; CHECK-LABEL: @vector_commute(
-; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
-; CHECK-NEXT: [[ZA:%.*]] = and <2 x i32> [[Z]],
-; CHECK-NEXT: [[S:%.*]] = lshr <2 x i32> [[Z]], [[ZA]]
-; CHECK-NEXT: [[A:%.*]] = add <2 x i32> [[S]], [[Z]]
-; CHECK-NEXT: [[S2:%.*]] = lshr <2 x i32> [[A]],
-; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S2]] to <2 x i16>
-; CHECK-NEXT: ret <2 x i16> [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i16>
+; CHECK-NEXT: [[ZA:%.*]] = and <2 x i16> [[Z]],
+; CHECK-NEXT: [[S:%.*]] = lshr <2 x i16> [[Z]], [[ZA]]
+; CHECK-NEXT: [[A:%.*]] = add <2 x i16> [[S]], [[Z]]
+; CHECK-NEXT: [[S2:%.*]] = lshr <2 x i16> [[A]],
+; CHECK-NEXT: ret <2 x i16> [[S2]]
 ;
 %z = zext <2 x i8> %x to <2 x i32>
 %za = and <2 x i32> %z,
@@ -182,6 +175,7 @@
 define i16 @shl_not_commute(i8 %x) {
 ; CHECK-LABEL: @shl_not_commute(
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], [[ZEXT]]
 ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16
 ; CHECK-NEXT: ret i16 [[TRUNC]]
 ;
 %zext = zext i8 %x to i32
 %shl = shl i32 %zext, %zext
@@ -193,10 +187,10 @@

 define i16 @shl_commute(i8 %x) {
 ; CHECK-LABEL: @shl_commute(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], [[AND]]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[ZEXT]], 15
+; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[ZEXT]], [[AND]]
+; CHECK-NEXT: ret i16 [[SHL]]
 ;
 %zext = zext i8 %x to i32
 %and = and i32 %zext, 15
@@ -207,10 +201,8 @@

 define i16 @lshr_exact(i16 %x) {
 ; CHECK-LABEL: @lshr_exact(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i32 [[ZEXT]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i16 [[X:%.*]], 15
+; CHECK-NEXT: ret i16 [[LSHR]]
 ;
 %zext = zext i16 %x to i32
 %lshr = lshr exact i32 %zext, 15
@@ -220,11 +212,9 @@

 define i16 @ashr_exact(i16 %x) {
 ; CHECK-LABEL: @ashr_exact(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT]], 32767
-; CHECK-NEXT: [[ASHR:%.*]] = ashr exact i32 [[AND]], 15
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ASHR]] to i16
-; CHECK-NEXT: ret i16 [[TRUNC]]
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[X:%.*]], 32767
+; CHECK-NEXT: [[ASHR:%.*]] = ashr exact i16 [[AND]], 15
+; CHECK-NEXT: ret i16 [[ASHR]]
 ;
 %zext = zext i16 %x to i32
 %and = and i32 %zext, 32767
@@ -235,10 +225,9 @@

 define i16 @shl_nuw(i8 %x, i8 %sh1) {
 ; CHECK-LABEL: @shl_nuw(
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[S:%.*]] = shl nuw i32 [[Z]], 8
-; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16
-; CHECK-NEXT: ret i16 [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[S:%.*]] = shl nuw i16 [[Z]], 8
+; CHECK-NEXT: ret i16 [[S]]
 ;
 %z = zext i8 %x to i32
 %s = shl nuw i32 %z, 8
@@ -248,10 +237,9 @@

 define i16 @shl_nsw(i8 %x, i8 %sh1) {
 ; CHECK-LABEL: @shl_nsw(
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[S:%.*]] = shl nsw i32 [[Z]], 8
-; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16
-; CHECK-NEXT: ret i16 [[T]]
+; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT: [[S:%.*]] = shl i16 [[Z]], 8
+; CHECK-NEXT: ret i16 [[S]]
 ;
 %z = zext i8 %x to i32
 %s = shl nsw i32 %z, 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
@@ -4,71 +4,35 @@

 define void @trunc_through_one_add(i16* noalias %0, i8* noalias readonly %1) {
 ; SSE-LABEL: @trunc_through_one_add(
-; SSE-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; SSE-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
-; SSE-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i32> [[TMP6]], [[TMP5]]
-; SSE-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
-; SSE-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; SSE-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP9]], <4 x i16>* [[TMP10]], align 2
-; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <4 x i8>*
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; SSE-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
-; SSE-NEXT: [[TMP16:%.*]] = lshr <4 x i32> [[TMP15]],
-; SSE-NEXT: [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP16]], [[TMP15]]
-; SSE-NEXT: [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]],
-; SSE-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
-; SSE-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP19]], <4 x i16>* [[TMP20]], align 2
-; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to <4 x i8>*
-; SSE-NEXT: [[TMP24:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1
-; SSE-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]],
-; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]],
-; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP22]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT: [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; SSE-NEXT: [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
-; SSE-NEXT: [[TMP36:%.*]] = lshr <4 x i32> [[TMP35]],
-; SSE-NEXT: [[TMP37:%.*]] = add nuw nsw <4 x i32> [[TMP36]], [[TMP35]]
-; SSE-NEXT: [[TMP38:%.*]] = lshr <4 x i32> [[TMP37]],
-; SSE-NEXT: [[TMP39:%.*]] = trunc <4 x i32> [[TMP38]] to <4 x i16>
-; SSE-NEXT: [[TMP40:%.*]] = bitcast i16* [[TMP32]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP39]], <4 x i16>* [[TMP40]], align 2
+; SSE-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; SSE-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16>
+; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i16> [[TMP5]],
+; SSE-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP6]], [[TMP5]]
+; SSE-NEXT: [[TMP8:%.*]] = lshr <8 x i16> [[TMP7]],
+; SSE-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
+; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
+; SSE-NEXT: [[TMP13:%.*]] = load <8 x i8>, <8 x i8>* [[TMP12]], align 1
+; SSE-NEXT: [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i16>
+; SSE-NEXT: [[TMP15:%.*]] = lshr <8 x i16> [[TMP14]],
+; SSE-NEXT: [[TMP16:%.*]] = add <8 x i16> [[TMP15]], [[TMP14]]
+; SSE-NEXT: [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]],
+; SSE-NEXT: [[TMP18:%.*]] = bitcast i16* [[TMP11]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP17]], <8 x i16>* [[TMP18]], align 2
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @trunc_through_one_add(
-; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
-; AVX-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
-; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]],
-; AVX-NEXT: [[TMP7:%.*]] = add nuw nsw <8 x i32> [[TMP6]], [[TMP5]]
-; AVX-NEXT: [[TMP8:%.*]] = lshr <8 x i32> [[TMP7]],
-; AVX-NEXT: [[TMP9:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
-; AVX-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2
-; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <8 x i8>*
-; AVX-NEXT: [[TMP14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
-; AVX-NEXT: [[TMP15:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i32>
-; AVX-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]],
-; AVX-NEXT: [[TMP17:%.*]] = add nuw nsw <8 x i32> [[TMP16]], [[TMP15]]
-; AVX-NEXT: [[TMP18:%.*]] = lshr <8 x i32> [[TMP17]],
-; AVX-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i16>
-; AVX-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP19]], <8 x i16>* [[TMP20]], align 2
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i16>
+; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP5]],
+; AVX-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP6]], [[TMP5]]
+; AVX-NEXT: [[TMP8:%.*]] = lshr <16 x i16> [[TMP7]],
+; AVX-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT: store <16 x i16> [[TMP8]], <16 x i16>* [[TMP9]], align 2
 ; AVX-NEXT: ret void
 ;
 %3 = load i8, i8* %1, align 1
@@ -218,99 +182,48 @@

 define void @trunc_through_two_adds(i16* noalias %0, i8* noalias readonly %1, i8* noalias readonly %2) {
 ; SSE-LABEL: @trunc_through_two_adds(
-; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; SSE-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
-; SSE-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <4 x i8>*
-; SSE-NEXT: [[TMP8:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; SSE-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
-; SSE-NEXT: [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
-; SSE-NEXT: [[TMP11:%.*]] = lshr <4 x i32> [[TMP10]],
-; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <4 x i32> [[TMP11]], [[TMP10]]
-; SSE-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP12]],
-; SSE-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
-; SSE-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP14]], <4 x i16>* [[TMP15]], align 2
-; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
-; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 4
-; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
-; SSE-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; SSE-NEXT: [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1
-; SSE-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
-; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
-; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
-; SSE-NEXT: [[TMP24:%.*]] = zext <4 x i8> [[TMP23]] to <4 x i32>
-; SSE-NEXT: [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP24]], [[TMP21]]
-; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]],
-; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
-; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]],
-; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
-; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
-; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
-; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
-; SSE-NEXT: [[TMP36:%.*]] = zext <4 x i8> [[TMP35]] to <4 x i32>
-; SSE-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP32]] to <4 x i8>*
-; SSE-NEXT: [[TMP38:%.*]] = load <4 x i8>, <4 x i8>* [[TMP37]], align 1
-; SSE-NEXT: [[TMP39:%.*]] = zext <4 x i8> [[TMP38]] to <4 x i32>
-; SSE-NEXT: [[TMP40:%.*]] = add nuw nsw <4 x i32> [[TMP39]], [[TMP36]]
-; SSE-NEXT: [[TMP41:%.*]] = lshr <4 x i32> [[TMP40]],
-; SSE-NEXT: [[TMP42:%.*]] = add nuw nsw <4 x i32> [[TMP41]], [[TMP40]]
-; SSE-NEXT: [[TMP43:%.*]] = lshr <4 x i32> [[TMP42]],
-; SSE-NEXT: [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i16>
-; SSE-NEXT: [[TMP45:%.*]] = bitcast i16* [[TMP33]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP44]], <4 x i16>* [[TMP45]], align 2
-; SSE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
-; SSE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 12
-; SSE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
-; SSE-NEXT: [[TMP49:%.*]] = bitcast i8* [[TMP46]] to <4 x i8>*
-; SSE-NEXT: [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1
-; SSE-NEXT: [[TMP51:%.*]] = zext <4 x i8> [[TMP50]] to <4 x i32>
-; SSE-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP47]] to <4 x i8>*
-; SSE-NEXT: [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
-; SSE-NEXT: [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
-; SSE-NEXT: [[TMP55:%.*]] = add nuw nsw <4 x i32> [[TMP54]], [[TMP51]]
-; SSE-NEXT: [[TMP56:%.*]] = lshr <4 x i32> [[TMP55]],
-; SSE-NEXT: [[TMP57:%.*]] = add nuw nsw <4 x i32> [[TMP56]], [[TMP55]]
-; SSE-NEXT: [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]],
-; SSE-NEXT: [[TMP59:%.*]] = trunc <4 x i32> [[TMP58]] to <4 x i16>
-; SSE-NEXT: [[TMP60:%.*]] = bitcast i16* [[TMP48]] to <4 x i16>*
-; SSE-NEXT: store <4 x i16> [[TMP59]], <4 x i16>* [[TMP60]], align 2
+; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
+; SSE-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i16>
+; SSE-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; SSE-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
+; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP9]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = lshr <8 x i16> [[TMP10]],
+; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP11]], [[TMP10]]
+; SSE-NEXT: [[TMP13:%.*]] = lshr <8 x i16> [[TMP12]],
+; SSE-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP13]], <8 x i16>* [[TMP14]], align 2
+; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
+; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
+; SSE-NEXT: [[TMP19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1
+; SSE-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[TMP19]] to <8 x i16>
+; SSE-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
+; SSE-NEXT: [[TMP22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP21]], align 1
+; SSE-NEXT: [[TMP23:%.*]] = zext <8 x i8> [[TMP22]] to <8 x i16>
+; SSE-NEXT: [[TMP24:%.*]] = add <8 x i16> [[TMP23]], [[TMP20]]
+; SSE-NEXT: [[TMP25:%.*]] = lshr <8 x i16> [[TMP24]],
+; SSE-NEXT: [[TMP26:%.*]] = add <8 x i16> [[TMP25]], [[TMP24]]
+; SSE-NEXT: [[TMP27:%.*]] = lshr <8 x i16> [[TMP26]],
+; SSE-NEXT: [[TMP28:%.*]] = bitcast i16* [[TMP17]] to <8 x i16>*
+; SSE-NEXT: store <8 x i16> [[TMP27]], <8 x i16>* [[TMP28]], align 2
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @trunc_through_two_adds(
-; AVX-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
-; AVX-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32>
-; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
-; AVX-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i32>
-; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <8 x i32> [[TMP9]], [[TMP6]]
-; AVX-NEXT: [[TMP11:%.*]] = lshr <8 x i32> [[TMP10]],
-; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <8 x i32> [[TMP11]], [[TMP10]]
-; AVX-NEXT: [[TMP13:%.*]] = lshr <8 x i32> [[TMP12]],
-; AVX-NEXT: [[TMP14:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i16>
-; AVX-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP14]], <8 x i16>* [[TMP15]], align 2
-; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
-; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
-; AVX-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
-; AVX-NEXT: [[TMP20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP19]], align 1
-; AVX-NEXT: [[TMP21:%.*]] = zext <8 x i8> [[TMP20]] to <8 x i32>
-; AVX-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>*
-; AVX-NEXT: [[TMP23:%.*]] = load <8 x i8>, <8 x i8>* [[TMP22]], align 1
-; AVX-NEXT: [[TMP24:%.*]] = zext <8 x i8> [[TMP23]] to <8 x i32>
-; AVX-NEXT: [[TMP25:%.*]] = add nuw nsw <8 x i32> [[TMP24]], [[TMP21]]
-; AVX-NEXT: [[TMP26:%.*]] = lshr <8 x i32> [[TMP25]],
-; AVX-NEXT: [[TMP27:%.*]] = add nuw nsw <8 x i32> [[TMP26]], [[TMP25]]
-; AVX-NEXT: [[TMP28:%.*]] = lshr <8 x i32> [[TMP27]],
-; AVX-NEXT: [[TMP29:%.*]] = trunc <8 x i32> [[TMP28]] to <8 x i16>
-; AVX-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>*
-; AVX-NEXT: store <8 x i16> [[TMP29]], <8 x i16>* [[TMP30]], align 2
+; AVX-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[TMP5]] to <16 x i16>
+; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <16 x i8>*
+; AVX-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
+; AVX-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
+; AVX-NEXT: [[TMP10:%.*]] = add <16 x i16> [[TMP9]], [[TMP6]]
+; AVX-NEXT: [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]],
+; AVX-NEXT: [[TMP12:%.*]] = add <16 x i16> [[TMP11]], [[TMP10]]
+; AVX-NEXT: [[TMP13:%.*]] = lshr <16 x i16> [[TMP12]],
+; AVX-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP0:%.*]] to <16 x i16>*
+; AVX-NEXT: store <16 x i16> [[TMP13]], <16 x i16>* [[TMP14]], align 2
 ; AVX-NEXT: ret void
 ;
 %4 = load i8, i8* %1, align 1