Index: llvm/trunk/include/llvm/IR/PatternMatch.h
===================================================================
--- llvm/trunk/include/llvm/IR/PatternMatch.h
+++ llvm/trunk/include/llvm/IR/PatternMatch.h
@@ -1271,6 +1271,12 @@
 }
 
 template <typename OpTy>
+inline match_combine_or<CastClass_match<OpTy, Instruction::ZExt>, OpTy>
+m_ZExtOrSelf(const OpTy &Op) {
+  return m_CombineOr(m_ZExt(Op), Op);
+}
+
+template <typename OpTy>
 inline match_combine_or<CastClass_match<OpTy, Instruction::ZExt>,
                         CastClass_match<OpTy, Instruction::SExt>>
 m_ZExtOrSExt(const OpTy &Op) {
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineShifts.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -27,42 +27,84 @@
 // This is valid for any shift, but they must be identical.
 static Instruction *
 reassociateShiftAmtsOfTwoSameDirectionShifts(BinaryOperator *Sh0,
-                                             const SimplifyQuery &SQ) {
-  // Look for:  (x shiftopcode ShAmt0) shiftopcode ShAmt1
-  Value *X, *ShAmt1, *ShAmt0;
+                                             const SimplifyQuery &SQ,
+                                             InstCombiner::BuilderTy &Builder) {
+  // Look for a shift of some instruction, ignore zext of shift amount if any.
+  Instruction *Sh0Op0;
+  Value *ShAmt0;
+  if (!match(Sh0,
+             m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0)))))
+    return nullptr;
+
+  // If there is a truncation between the two shifts, we must make note of it
+  // and look through it. The truncation imposes additional constraints on the
+  // transform.
   Instruction *Sh1;
-  if (!match(Sh0, m_Shift(m_CombineAnd(m_Shift(m_Value(X), m_Value(ShAmt1)),
-                                       m_Instruction(Sh1)),
-                          m_Value(ShAmt0))))
+  Value *Trunc = nullptr;
+  match(Sh0Op0,
+        m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)),
+                    m_Instruction(Sh1)));
+
+  // Inner shift: (x shiftopcode ShAmt1)
+  Value *X, *ShAmt1;
+  if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1)))))
     return nullptr;
 
   // The shift opcodes must be identical.
   Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode();
   if (ShiftOpcode != Sh1->getOpcode())
     return nullptr;
+
+  // Did we match a pattern with truncation?
+  if (Trunc) {
+    // For right-shifts we can't do any such simplifications. Leave as-is.
+    if (ShiftOpcode != Instruction::BinaryOps::Shl)
+      return nullptr; // FIXME: still could perform constant-folding.
+    // If we saw truncation, we'll need to produce an extra instruction,
+    // and for that one of the operands of the shift must be one-use.
+    if (!match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+      return nullptr;
+  }
+
   // Can we fold (ShAmt0+ShAmt1) ?
-  Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, ShAmt0, ShAmt1,
-                                  SQ.getWithInstruction(Sh0));
+  auto *NewShAmt = dyn_cast_or_null<Constant>(
+      SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false,
+                      SQ.getWithInstruction(Sh0)));
   if (!NewShAmt)
     return nullptr; // Did not simplify.
-  // Is the new shift amount smaller than the bit width?
-  // FIXME: could also rely on ConstantRange.
-  unsigned BitWidth = X->getType()->getScalarSizeInBits();
-  if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
-                                          APInt(BitWidth, BitWidth))))
-    return nullptr;
+  // Is the new shift amount smaller than the bit width of inner shift?
+  if (!match(NewShAmt, m_SpecificInt_ICMP(
+                           ICmpInst::Predicate::ICMP_ULT,
+                           APInt(NewShAmt->getType()->getScalarSizeInBits(),
+                                 X->getType()->getScalarSizeInBits()))))
+    return nullptr; // FIXME: could perform constant-folding.
+
   // All good, we can do this fold.
+  NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType());
+
   BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt);
-  // If both of the original shifts had the same flag set, preserve the flag.
-  if (ShiftOpcode == Instruction::BinaryOps::Shl) {
-    NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() &&
-                                   Sh1->hasNoUnsignedWrap());
-    NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() &&
-                                 Sh1->hasNoSignedWrap());
-  } else {
-    NewShift->setIsExact(Sh0->isExact() && Sh1->isExact());
+
+  // The flags can only be propagated if there wasn't a trunc.
+  if (!Trunc) {
+    // If the pattern did not involve trunc, and both of the original shifts
+    // had the same flag set, preserve the flag.
+    if (ShiftOpcode == Instruction::BinaryOps::Shl) {
+      NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() &&
+                                     Sh1->hasNoUnsignedWrap());
+      NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() &&
+                                   Sh1->hasNoSignedWrap());
+    } else {
+      NewShift->setIsExact(Sh0->isExact() && Sh1->isExact());
+    }
   }
-  return NewShift;
+
+  Instruction *Ret = NewShift;
+  if (Trunc) {
+    Builder.Insert(NewShift);
+    Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType());
+  }
+
+  return Ret;
 }
 
 // If we have some pattern that leaves only some low bits set, and then performs
@@ -158,7 +200,7 @@
     return Res;
 
   if (Instruction *NewShift =
-          reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ))
+          reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ, Builder))
     return NewShift;
 
   // (C1 shift (A add C2)) -> (C1 shift C2) shift A)
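Why is the shl case of this reassociation sound even across a trunc? Truncation commutes with left shifts: the low bits of a shl depend only on the low bits of its input, so trunc(x << a) << b equals trunc(x << (a + b)) whenever a + b stays below the inner bit width, which is exactly what the ICMP_ULT check above guarantees. The following standalone C++ sanity check mirrors the @t0 test from the test diff below; the harness and names are illustrative only, not part of the patch:

#include <cassert>
#include <cstdint>

// Mirrors @t0 below: in i16 arithmetic, trunc(x << (32 - y)) << (y - 24)
// must equal trunc(x << 8), since the two shift amounts sum to 8 and
// truncation commutes with shl (low bits depend only on low input bits).
int main() {
  for (uint64_t x = 0; x <= 0xFFFFFFFF; x += 0x10001) { // sample of i32 values
    for (uint32_t y = 25; y <= 31; ++y) {  // keeps both shift amounts in range
      uint16_t t3 = (uint16_t)((uint32_t)x << (32 - y)); // inner shl, then trunc
      uint16_t t5 = (uint16_t)(t3 << (y - 24));          // outer shl in i16
      uint16_t folded = (uint16_t)((uint32_t)x << 8);    // reassociated form
      assert(t5 == folded);
    }
  }
  return 0;
}

The folded IR emitted by the patch, shl i16 (trunc i32 %x to i16), 8, computes the same value as trunc(x << 8) above, again because trunc and shl commute.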
Index: llvm/trunk/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-shl.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-shl.ll
+++ llvm/trunk/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-shl.ll
@@ -12,12 +12,8 @@
 
 define i16 @t0(i32 %x, i16 %y) {
 ; CHECK-LABEL: @t0(
-; CHECK-NEXT:    [[T0:%.*]] = sub i16 32, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = zext i16 [[T0]] to i32
-; CHECK-NEXT:    [[T2:%.*]] = shl i32 [[X:%.*]], [[T1]]
-; CHECK-NEXT:    [[T3:%.*]] = trunc i32 [[T2]] to i16
-; CHECK-NEXT:    [[T4:%.*]] = add i16 [[Y]], -24
-; CHECK-NEXT:    [[T5:%.*]] = shl i16 [[T3]], [[T4]]
+; CHECK-NEXT:    [[X_TR:%.*]] = trunc i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[T5:%.*]] = shl i16 [[X_TR]], 8
 ; CHECK-NEXT:    ret i16 [[T5]]
 ;
   %t0 = sub i16 32, %y
@@ -31,12 +27,8 @@
 
 define <2 x i16> @t1_vec_splat(<2 x i32> %x, <2 x i16> %y) {
 ; CHECK-LABEL: @t1_vec_splat(
-; CHECK-NEXT:    [[T0:%.*]] = sub <2 x i16> <i16 32, i16 32>, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = zext <2 x i16> [[T0]] to <2 x i32>
-; CHECK-NEXT:    [[T2:%.*]] = shl <2 x i32> [[X:%.*]], [[T1]]
-; CHECK-NEXT:    [[T3:%.*]] = trunc <2 x i32> [[T2]] to <2 x i16>
-; CHECK-NEXT:    [[T4:%.*]] = add <2 x i16> [[Y]], <i16 -24, i16 -24>
-; CHECK-NEXT:    [[T5:%.*]] = shl <2 x i16> [[T3]], [[T4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i32> [[X:%.*]], <i32 8, i32 8>
+; CHECK-NEXT:    [[T5:%.*]] = trunc <2 x i32> [[TMP1]] to <2 x i16>
 ; CHECK-NEXT:    ret <2 x i16> [[T5]]
 ;
   %t0 = sub <2 x i16> <i16 32, i16 32>, %y
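All of these tests use shl because the patch deliberately refuses to reassociate right shifts through a trunc (the ShiftOpcode != Instruction::BinaryOps::Shl bail-out above): truncation does not commute with lshr/ashr, since bits discarded by the trunc would have been shifted back down into the low half. A standalone C++ counterexample, again illustrative rather than part of the patch:

#include <cstdint>
#include <cstdio>

// trunc-to-i16(x >> 4) >> 4 is NOT the same as trunc-to-i16(x >> 8): the
// first form drops bits 20..31 of x at the trunc, while the second form
// still sees bits 8..23. Any x with a bit set in [20, 24) separates them.
int main() {
  uint32_t x = 0x00100000; // bit 20 set
  uint16_t viaTrunc = (uint16_t)((uint16_t)(x >> 4) >> 4); // 0x0000
  uint16_t reassociated = (uint16_t)(x >> 8);              // 0x1000
  printf("via trunc: %#06x, reassociated: %#06x\n", viaTrunc, reassociated);
  return 0;
}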
@@ -50,12 +42,8 @@
 
 define <2 x i16> @t2_vec_nonsplat(<2 x i32> %x, <2 x i16> %y) {
 ; CHECK-LABEL: @t2_vec_nonsplat(
-; CHECK-NEXT:    [[T0:%.*]] = sub <2 x i16> , [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = zext <2 x i16> [[T0]] to <2 x i32>
-; CHECK-NEXT:    [[T2:%.*]] = shl <2 x i32> [[X:%.*]], [[T1]]
-; CHECK-NEXT:    [[T3:%.*]] = trunc <2 x i32> [[T2]] to <2 x i16>
-; CHECK-NEXT:    [[T4:%.*]] = add <2 x i16> [[Y]],
-; CHECK-NEXT:    [[T5:%.*]] = shl <2 x i16> [[T3]], [[T4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i32> [[X:%.*]],
+; CHECK-NEXT:    [[T5:%.*]] = trunc <2 x i32> [[TMP1]] to <2 x i16>
 ; CHECK-NEXT:    ret <2 x i16> [[T5]]
 ;
   %t0 = sub <2 x i16> , %y
@@ -71,12 +59,8 @@
 
 define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) {
 ; CHECK-LABEL: @t3_vec_nonsplat_undef0(
-; CHECK-NEXT:    [[T0:%.*]] = sub <3 x i16> , [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = zext <3 x i16> [[T0]] to <3 x i32>
-; CHECK-NEXT:    [[T2:%.*]] = shl <3 x i32> [[X:%.*]], [[T1]]
-; CHECK-NEXT:    [[T3:%.*]] = trunc <3 x i32> [[T2]] to <3 x i16>
-; CHECK-NEXT:    [[T4:%.*]] = add <3 x i16> [[Y]],
-; CHECK-NEXT:    [[T5:%.*]] = shl <3 x i16> [[T3]], [[T4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <3 x i32> [[X:%.*]],
+; CHECK-NEXT:    [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16>
 ; CHECK-NEXT:    ret <3 x i16> [[T5]]
 ;
   %t0 = sub <3 x i16> , %y
@@ -90,12 +74,8 @@
 
 define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) {
 ; CHECK-LABEL: @t4_vec_nonsplat_undef1(
-; CHECK-NEXT:    [[T0:%.*]] = sub <3 x i16> , [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = zext <3 x i16> [[T0]] to <3 x i32>
-; CHECK-NEXT:    [[T2:%.*]] = shl <3 x i32> [[X:%.*]], [[T1]]
-; CHECK-NEXT:    [[T3:%.*]] = trunc <3 x i32> [[T2]] to <3 x i16>
-; CHECK-NEXT:    [[T4:%.*]] = add <3 x i16> [[Y]],
-; CHECK-NEXT:    [[T5:%.*]] = shl <3 x i16> [[T3]], [[T4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <3 x i32> [[X:%.*]],
+; CHECK-NEXT:    [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16>
 ; CHECK-NEXT:    ret <3 x i16> [[T5]]
 ;
   %t0 = sub <3 x i16> , %y
@@ -109,12 +89,8 @@
 
 define <3 x i16> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) {
 ; CHECK-LABEL: @t5_vec_nonsplat_undef1(
-; CHECK-NEXT:    [[T0:%.*]] = sub <3 x i16> , [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = zext <3 x i16> [[T0]] to <3 x i32>
-; CHECK-NEXT:    [[T2:%.*]] = shl <3 x i32> [[X:%.*]], [[T1]]
-; CHECK-NEXT:    [[T3:%.*]] = trunc <3 x i32> [[T2]] to <3 x i16>
-; CHECK-NEXT:    [[T4:%.*]] = add <3 x i16> [[Y]],
-; CHECK-NEXT:    [[T5:%.*]] = shl <3 x i16> [[T3]], [[T4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <3 x i32> [[X:%.*]],
+; CHECK-NEXT:    [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16>
 ; CHECK-NEXT:    ret <3 x i16> [[T5]]
 ;
   %t0 = sub <3 x i16> , %y
@@ -137,9 +113,9 @@
 ; CHECK-NEXT:    [[T1:%.*]] = zext i16 [[T0]] to i32
 ; CHECK-NEXT:    [[T2:%.*]] = shl i32 [[X:%.*]], [[T1]]
 ; CHECK-NEXT:    [[T3:%.*]] = trunc i32 [[T2]] to i16
-; CHECK-NEXT:    [[T4:%.*]] = add i16 [[Y]], -24
 ; CHECK-NEXT:    call void @use16(i16 [[T3]])
-; CHECK-NEXT:    [[T5:%.*]] = shl i16 [[T3]], [[T4]]
+; CHECK-NEXT:    [[X_TR:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT:    [[T5:%.*]] = shl i16 [[X_TR]], 8
 ; CHECK-NEXT:    ret i16 [[T5]]
 ;
   %t0 = sub i16 32, %y
@@ -154,13 +130,10 @@
 
 define i16 @t7_extrause1(i32 %x, i16 %y) {
 ; CHECK-LABEL: @t7_extrause1(
-; CHECK-NEXT:    [[T0:%.*]] = sub i16 32, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = zext i16 [[T0]] to i32
-; CHECK-NEXT:    [[T2:%.*]] = shl i32 [[X:%.*]], [[T1]]
-; CHECK-NEXT:    [[T3:%.*]] = trunc i32 [[T2]] to i16
-; CHECK-NEXT:    [[T4:%.*]] = add i16 [[Y]], -24
+; CHECK-NEXT:    [[T4:%.*]] = add i16 [[Y:%.*]], -24
 ; CHECK-NEXT:    call void @use16(i16 [[T4]])
-; CHECK-NEXT:    [[T5:%.*]] = shl i16 [[T3]], [[T4]]
+; CHECK-NEXT:    [[X_TR:%.*]] = trunc i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[T5:%.*]] = shl i16 [[X_TR]], 8
 ; CHECK-NEXT:    ret i16 [[T5]]
 ;
   %t0 = sub i16 32, %y