Index: lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -126,21 +126,6 @@
 
   switch (Op->getOpcode()) {
   default: break;
-  case Instruction::Or:
-    if (Op->hasOneUse()){
-      ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together);
-      if (TogetherCI && !TogetherCI->isZero()){
-        // (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1
-        // NOTE: This reduces the number of bits set in the & mask, which
-        // can expose opportunities for store narrowing.
-        Together = ConstantExpr::getXor(AndRHS, Together);
-        Value *And = Builder.CreateAnd(X, Together);
-        And->takeName(Op);
-        return BinaryOperator::CreateOr(And, OpRHS);
-      }
-    }
-
-    break;
   case Instruction::Add:
     if (Op->hasOneUse()) {
       // Adding a one to a single bit bit-field should be turned into an XOR
@@ -1223,6 +1208,22 @@
       return BinaryOperator::CreateXor(And, NewC);
     }
 
+    const APInt *OrC;
+    if (match(Op0, m_OneUse(m_Or(m_Value(X), m_APInt(OrC))))) {
+      // (X | C1) & C2 --> (X & (C2^(C1&C2))) | (C1&C2)
+      // NOTE: This reduces the number of bits set in the & mask, which
+      // can expose opportunities for store narrowing for scalars.
+      // NOTE: SimplifyDemandedBits should have already removed bits from C1
+      // that aren't set in C2, so we could use C1 instead of (C1&C2)
+      // above, but this feels safer.
+      APInt Together = *C & *OrC;
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(I.getType(),
+                                                         Together ^ *C));
+      And->takeName(Op0);
+      return BinaryOperator::CreateOr(And, ConstantInt::get(I.getType(),
+                                                            Together));
+    }
+
     // If the mask is only needed on one incoming arm, push the 'and' op up.
     if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) ||
         match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
Index: test/Transforms/InstCombine/or.ll
===================================================================
--- test/Transforms/InstCombine/or.ll
+++ test/Transforms/InstCombine/or.ll
@@ -268,6 +268,21 @@
   ret i32 %E
 }
 
+define <2 x i32> @test30vec(<2 x i32> %A) {
+; CHECK-LABEL: @test30vec(
+; CHECK-NEXT:    [[C:%.*]] = and <2 x i32> [[A:%.*]], <i32 -65536, i32 -65536>
+; CHECK-NEXT:    [[B:%.*]] = and <2 x i32> [[A]], <i32 7224, i32 7224>
+; CHECK-NEXT:    [[D:%.*]] = or <2 x i32> [[B]], <i32 32962, i32 32962>
+; CHECK-NEXT:    [[E:%.*]] = or <2 x i32> [[D]], [[C]]
+; CHECK-NEXT:    ret <2 x i32> [[E]]
+;
+  %B = or <2 x i32> %A, <i32 32962, i32 32962>
+  %C = and <2 x i32> %A, <i32 -65536, i32 -65536>
+  %D = and <2 x i32> %B, <i32 40186, i32 40186>
+  %E = or <2 x i32> %D, %C
+  ret <2 x i32> %E
+}
+
 ; PR4216
 define i64 @test31(i64 %A) {
 ; CHECK-LABEL: @test31(
@@ -285,6 +300,22 @@
   ret i64 %F
 }
 
+define <2 x i64> @test31vec(<2 x i64> %A) {
+; CHECK-LABEL: @test31vec(
+; CHECK-NEXT:    [[E:%.*]] = and <2 x i64> [[A:%.*]], <i64 4294908984, i64 4294908984>
+; CHECK-NEXT:    [[F:%.*]] = or <2 x i64> [[E]], <i64 32962, i64 32962>
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %B = or <2 x i64> %A, <i64 194, i64 194>
+  %D = and <2 x i64> %B, <i64 250, i64 250>
+
+  %C = or <2 x i64> %A, <i64 32768, i64 32768>
+  %E = and <2 x i64> %C, <i64 4294941696, i64 4294941696>
+
+  %F = or <2 x i64> %D, %E
+  ret <2 x i64> %F
+}
+
 ; codegen is mature enough to handle vector selects.
 define <4 x i32> @test32(<4 x i1> %and.i1352, <4 x i32> %vecinit6.i176, <4 x i32> %vecinit6.i191) {
 ; CHECK-LABEL: @test32(
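
For reviewers, a standalone sanity check of the rewritten identity. This snippet is illustrative only and not part of the patch; the beforeFold/afterFold names are made up for the demonstration. It verifies (X | C1) & C2 == (X & (C2 ^ (C1 & C2))) | (C1 & C2) exhaustively over the 16 bits the constants touch, using test30's constants, and checks that the narrowed 'and' mask matches the 7224 expected by the new CHECK lines.

// Illustrative only, not part of the patch.
#include <cassert>
#include <cstdint>

// Original form: (X | C1) & C2.
static uint32_t beforeFold(uint32_t X, uint32_t C1, uint32_t C2) {
  return (X | C1) & C2;
}

// Rewritten form: (X & (C2 ^ (C1 & C2))) | (C1 & C2).
static uint32_t afterFold(uint32_t X, uint32_t C1, uint32_t C2) {
  uint32_t Together = C1 & C2;             // bits C1 forces on that survive the mask
  return (X & (C2 ^ Together)) | Together; // strictly fewer bits set in the 'and' mask
}

int main() {
  const uint32_t C1 = 32962, C2 = 40186;   // constants from test30/test30vec
  assert((C2 ^ (C1 & C2)) == 7224);        // the narrowed mask in the CHECK lines
  for (uint32_t X = 0; X <= 0xFFFF; ++X)   // both constants fit in 16 bits
    assert(beforeFold(X, C1, C2) == afterFold(X, C1, C2));
  return 0;
}

The new vector tests are exercised by or.ll's existing RUN line (opt -instcombine -S piped into FileCheck); with the m_APInt-based fold they now narrow the splat masks the same way the scalar test30/test31 already do.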