Index: lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -741,6 +741,54 @@
   return nullptr;
 }
 
+namespace {
+
+/// Describes an icmp-with-zero that tests a group of bits of an integer value.
+struct BitGroupCheck {
+  // True if the compare checks that at least one bit of the group is set,
+  // false if it checks that all bits of the group are zero.
+  bool CheckIfSet {false};
+  // The mask that identifies the bit group; null if no bit group was found.
+  const APInt *Mask {nullptr};
+};
+} // end anonymous namespace
+
+/// For an ICMP where RHS is zero, we want to check if the ICMP is equivalent to
+/// comparing a group of bits in an integer value against zero. \p LHS is the
+/// left operand of the ICMP and \p CC its predicate. On failure the returned
+/// BitGroupCheck has a null Mask.
+static BitGroupCheck isAnyBitSet(Value *LHS, ICmpInst::Predicate CC) {
+  BitGroupCheck BGC;
+  auto *Inst = dyn_cast<Instruction>(LHS);
+
+  if (!Inst || Inst->getOpcode() != Instruction::And)
+    return BGC;
+
+  // TODO: Currently this does not work for vectors.
+  ConstantInt *Mask;
+  if (!match(LHS, m_And(m_Value(), m_ConstantInt(Mask))))
+    return BGC;
+  // At this point we know that LHS of ICMP is "and" of a value with a constant.
+  // Also we know that the RHS is zero. That means we are checking if a certain
+  // group of bits in a given integer value are all zero or at least one of them
+  // is set to one.
+  switch (CC) {
+  default:
+    return BGC;
+  case ICmpInst::ICMP_EQ:
+    BGC.CheckIfSet = false;
+    break;
+  case ICmpInst::ICMP_NE:
+  case ICmpInst::ICMP_UGT:
+    // Unsigned "greater than zero" is the same as "not equal to zero".
+    BGC.CheckIfSet = true;
+    break;
+  }
+
+  BGC.Mask = &Mask->getValue();
+  return BGC;
+}
+
 /// Try to fold a signed range checked with lower bound 0 to an unsigned icmp.
 /// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
 /// If \p Inverted is true then the check is for the inverted range, e.g.
@@ -797,6 +845,33 @@
   return Builder->CreateICmp(NewPred, Input, RangeEnd);
 }
 
+/// Fold (icmp)^(icmp) if possible.
+Value *InstCombiner::FoldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+  Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
+  // TODO: The lines below do not work for vectors. ConstantInt is scalar.
+  auto *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  auto *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (!LHSCst || !RHSCst)
+    return nullptr;
+  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+
+  // E.g. (icmp ne %x, 0) ^ (icmp ne %y, 0) => icmp ne %x, %y if the following
+  // conditions hold:
+  // 1- (%x = and %a, %mask) and (%y = and %b, %mask)
+  // 2- %mask is a power of 2.
+  // The same holds when both compares are "eq". LHSCst == RHSCst also
+  // guarantees that %x and %y have the same type.
+  if (RHSCst->isZero() && LHSCst == RHSCst) {
+    BitGroupCheck BGC1 = isAnyBitSet(Val, LHSCC);
+    BitGroupCheck BGC2 = isAnyBitSet(Val2, RHSCC);
+    if (BGC1.Mask && BGC2.Mask && BGC1.CheckIfSet == BGC2.CheckIfSet &&
+        *BGC1.Mask == *BGC2.Mask && BGC1.Mask->isPowerOf2()) {
+      return Builder->CreateICmp(ICmpInst::ICMP_NE, Val2, Val);
+    }
+  }
+  return nullptr;
+}
+
 /// Fold (icmp)&(icmp) if possible.
 Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
@@ -879,6 +954,37 @@
     }
   }
 
+  // E.g. (icmp eq %x, 0) & (icmp ne %y, 0) => icmp ult %x, %y if the following
+  // conditions hold:
+  // 1- (%x = and %a, %mask1) and (%y = and %b, %mask2)
+  // 2- Let %t be the smallest power of 2 where %mask1 & %t != 0. Then we must
+  //    have %mask2 u<= %t, so that any nonzero %x is at least as large as any
+  //    possible %y. Comparing only the highest bit of %mask2 against %t is not
+  //    enough: with %mask1 = 24 and %mask2 = 15, %x = 8 and %y = 9 give
+  //    %x u< %y although %x != 0.
+  if (RHSCst->isZero() && LHSCst == RHSCst) {
+    BitGroupCheck BGC1 = isAnyBitSet(Val, LHSCC);
+    BitGroupCheck BGC2 = isAnyBitSet(Val2, RHSCC);
+
+    if (BGC1.Mask && BGC2.Mask && (BGC1.CheckIfSet != BGC2.CheckIfSet)) {
+      // ZeroMask selects the bits compared "== 0", SetMask those compared
+      // "!= 0". A zero mask has no lowest set bit to compare, so reject it.
+      auto CanFold = [](const APInt &ZeroMask, const APInt &SetMask) {
+        if (!ZeroMask || !SetMask)
+          return false;
+        APInt LowestBit = APInt::getOneBitSet(
+            ZeroMask.getBitWidth(), ZeroMask.countTrailingZeros());
+        return SetMask.ule(LowestBit);
+      };
+      if (!BGC1.CheckIfSet) {
+        if (CanFold(*BGC1.Mask, *BGC2.Mask))
+          return Builder->CreateICmp(ICmpInst::ICMP_ULT, Val, Val2);
+      } else if (CanFold(*BGC2.Mask, *BGC1.Mask)) {
+        return Builder->CreateICmp(ICmpInst::ICMP_ULT, Val2, Val);
+      }
+    }
+  }
+
   // From here on, we only handle:
   //    (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
   if (Val != Val2) return nullptr;
@@ -2714,9 +2820,16 @@
       match(Op1, m_Not(m_Specific(A))))
     return BinaryOperator::CreateNot(Builder->CreateAnd(A, B));
 
-  // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
   if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
-    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) {
+
+      // E.g. if we have xor (icmp eq %A, 0), (icmp eq %B, 0)
+      // and we know both A and B are either 8 (power of 2) or 0
+      // we can simplify to (icmp ne A, B).
+      if (Value *Res = FoldXorOfICmps(LHS, RHS))
+        return replaceInstUsesWith(I, Res);
+
+      // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
       if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
         if (LHS->getOperand(0) == RHS->getOperand(1) &&
             LHS->getOperand(1) == RHS->getOperand(0))
@@ -2731,6 +2844,7 @@
                                                Builder));
         }
       }
+    }
 
   if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
     return CastedXor;
Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -225,6 +225,7 @@
   Instruction *visitFDiv(BinaryOperator &I);
   Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
   Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+  Value *FoldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS);
   Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
   Instruction *visitAnd(BinaryOperator &I);
   Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
Index: test/Transforms/InstCombine/and-or-icmps.ll
===================================================================
--- test/Transforms/InstCombine/and-or-icmps.ll
+++ test/Transforms/InstCombine/and-or-icmps.ll
@@ -51,3 +51,212 @@
   ret i1 %tmp1042
 }
 
+; Last three instructions (ignoring ret) are equivalent to %val2 u< %val1.
+define i1 @test2(i32 %a, i32 %b) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; Last three instructions (ignoring ret) are equivalent to %val2 u< %val1.
+define i1 @test3(i32 %a, i32 %b) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.a, %cmp.b
+  ret i1 %and
+}
+
+; The masks do not have to be powers of 2: every nonzero value of %val2
+; (8, 16 or 24) is u>= every value of %val1 (0..7), so the last three
+; instructions (ignoring ret) are still equivalent to %val2 u< %val1.
+define i1 @test4(i32 %a, i32 %b) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 7
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 7
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.a, %cmp.b
+  ret i1 %and
+}
+
+; No simplification is possible here: for %a = 9 and %b = 8 we have %val1 = 9
+; and %val2 = 8, so %val2 u< %val1 is true although %and is false (%val2 != 0).
+define i1 @test5(i32 %a, i32 %b) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 15
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i32 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp eq i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i32 %a, 15
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; An optimization like those of the previous tests is not possible here:
+; for example if %b = 8 and %a = 16, we have %val2 = 8 and %val1 = 16,
+; so %val2 u< %val1 but %and == 0.
+define i1 @test6(i32 %a, i32 %b) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 16
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i32 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp eq i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i32 %a, 16
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; %a and %b have different widths, so the optimization is not possible.
+define i1 @test7(i16 %a, i32 %b) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i16 %a, 15
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i16 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp eq i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i16 %a, 15
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i16 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; The last three instructions can be simplified to checking %val1 != %val2.
+; After that, other transformations change the code further.
+define i1 @test8(i32 %a, i32 %b) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp ne i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; The masks of the 'and' instructions must be identical powers of 2, otherwise
+; a simplification like that of the previous testcase is not possible.
+define i1 @test9(i32 %a, i32 %b) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 24
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i32 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ne i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = xor i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i32 %a, 24
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp ne i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; The last three instructions are equivalent to checking %val1 != %val2.
+; After making this change, other transformations further change the code.
+define i1 @test10(i32 %a, i32 %b) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp eq i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; Cannot be simplified because %a and %b have different widths.
+define i1 @test11(i16 %a, i32 %b) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i16 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i16 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ne i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = xor i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i16 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i16 %val1, 0
+  %cmp.b = icmp ne i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; Similar to @test8 except that the icmp instructions use ugt instead of ne.
+define i1 @test12(i32 %a, i32 %b) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ugt i32 %val1, 0
+  %cmp.b = icmp ugt i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; Similar to @test3 except that the first icmp uses ugt instead of ne.
+define i1 @test13(i32 %a, i32 %b) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ugt i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.a, %cmp.b
+  ret i1 %and
+}