Index: lib/Target/AArch64/AArch64ISelDAGToDAG.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1981,6 +1981,97 @@ return isShiftedMask_64(Mask); } +// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being +// inserted only sets known zero bits. +static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + unsigned BitWidth = VT.getSizeInBits(); + + uint64_t OrImm; + if (!isOpcWithIntImmediate(N, ISD::OR, OrImm)) + return false; + + // Skip this transformation if the ORR immediate can be encoded in the ORR. + // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely + // performance neutral. + if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth)) + return false; + + uint64_t MaskImm; + SDValue And = N->getOperand(0); + // Must be a single use AND with an immediate operand. + if (!And.hasOneUse() || + !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm)) + return false; + + // Compute the Known Zero for the AND as this allows us to catch more general + // cases than just looking for AND with imm. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(And, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value. + uint64_t NotKnownZero = (~KnownZero).getZExtValue(); + + // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). + if (!isShiftedMask(KnownZero.getZExtValue(), VT)) + return false; + + // The bits being inserted must only set those bits that are known to be zero. + if ((OrImm & NotKnownZero) != 0) { + // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't + // currently handle this case. + return false; + } + + // BFI/BFXIL dst, src, #lsb, #width. + int LSB = countTrailingOnes(NotKnownZero); + int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation(); + + // BFI/BFXIL is an alias of BFM, so translate to BFM operands. + unsigned ImmR = (BitWidth - LSB) % BitWidth; + unsigned ImmS = Width - 1; + + // If we're creating a BFI instruction avoid cases where we need more + // instructions to materialize the BFI constant as compared to the original + // ORR. A BFXIL will use the same constant as the original ORR, so the code + // should be no worse in this case. + bool IsBFI = LSB != 0; + uint64_t BFIImm = OrImm >> LSB; + if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) { + // We have a BFI instruction and we know the constant can't be materialized + // with a ORR-immediate with the zero register. + unsigned OrChunks = 0, BFIChunks = 0; + for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) { + if (((OrImm >> Shift) & 0xFFFF) != 0) + ++OrChunks; + if (((BFIImm >> Shift) & 0xFFFF) != 0) + ++BFIChunks; + } + if (BFIChunks > OrChunks) + return false; + } + + // Materialize the constant to be inserted. + SDLoc DL(N); + unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm; + SDNode *MOVI = CurDAG->getMachineNode( + MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT)); + + // Create the BFI/BFXIL instruction. + SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0), + CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; +} + static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits, SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); @@ -2159,7 +2250,10 @@ return true; } - return tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG); + if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG)) + return true; + + return tryBitfieldInsertOpFromOrAndImm(N, CurDAG); } /// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the Index: test/CodeGen/AArch64/bitfield-insert.ll =================================================================== --- test/CodeGen/AArch64/bitfield-insert.ll +++ test/CodeGen/AArch64/bitfield-insert.ll @@ -378,3 +378,88 @@ %or = or i32 %and, %and1 ret i32 %or } + +; CHECK-LABEL: @test1 +; CHECK: movz [[REG:w[0-9]+]], #5 +; CHECK: bfxil w0, [[REG]], #0, #4 +define i32 @test1(i32 %a) { + %1 = and i32 %a, -16 ; 0xfffffff0 + %2 = or i32 %1, 5 ; 0x00000005 + ret i32 %2 +} + +; CHECK-LABEL: @test2 +; CHECK: movz [[REG:w[0-9]+]], #10 +; CHECK: bfi w0, [[REG]], #22, #4 +define i32 @test2(i32 %a) { + %1 = and i32 %a, -62914561 ; 0xfc3fffff + %2 = or i32 %1, 41943040 ; 0x06400000 + ret i32 %2 +} + +; CHECK-LABEL: @test3 +; CHECK: movz [[REG:x[0-9]+]], #5 +; CHECK: bfxil x0, [[REG]], #0, #3 +define i64 @test3(i64 %a) { + %1 = and i64 %a, -8 ; 0xfffffffffffffff8 + %2 = or i64 %1, 5 ; 0x0000000000000005 + ret i64 %2 +} + +; CHECK-LABEL: @test4 +; CHECK: movz [[REG:x[0-9]+]], #9 +; CHECK: bfi x0, [[REG]], #1, #7 +define i64 @test4(i64 %a) { + %1 = and i64 %a, -255 ; 0xffffffffffffff01 + %2 = or i64 %1, 18 ; 0x0000000000000012 + ret i64 %2 +} + +; Don't generate BFI/BFXIL if the immediate can be encoded in the ORR. +; CHECK-LABEL: @test5 +; CHECK: and [[REG:w[0-9]+]], w0, #0xfffffff0 +; CHECK: orr w0, [[REG]], #0x6 +define i32 @test5(i32 %a) { + %1 = and i32 %a, 4294967280 ; 0xfffffff0 + %2 = or i32 %1, 6 ; 0x00000006 + ret i32 %2 +} + +; BFXIL will use the same constant as the ORR, so we don't care how the constant +; is materialized (it's an equal cost either way). +; CHECK-LABEL: @test6 +; CHECK: movz [[REG:w[0-9]+]], #11, lsl #16 +; CHECK: movk [[REG]], #23250 +; CHECK: bfxil w0, [[REG]], #0, #20 +define i32 @test6(i32 %a) { + %1 = and i32 %a, 4293918720 ; 0xfff00000 + %2 = or i32 %1, 744146 ; 0x000b5ad2 + ret i32 %2 +} + +; BFIs that require the same number of instruction to materialize the constant +; as the original ORR are okay. +; CHECK-LABEL: @test7 +; CHECK: movz [[REG:w[0-9]+]], #5, lsl #16 +; CHECK: movk [[REG]], #44393 +; CHECK: bfi w0, [[REG]], #1, #19 +define i32 @test7(i32 %a) { + %1 = and i32 %a, 4293918721 ; 0xfff00001 + %2 = or i32 %1, 744146 ; 0x000b5ad2 + ret i32 %2 +} + +; BFIs that require more instructions to materialize the constant as compared +; to the original ORR are not okay. In this case we would be replacing the +; 'and' with a 'movk', which would decrease ILP while using the same number of +; instructions. +; CHECK: @test8 +; CHECK: movz [[REG2:x[0-9]+]], #36694, lsl #32 +; CHECK: and [[REG1:x[0-9]+]], x0, #0xff000000000000ff +; CHECK: movk [[REG2]], #31059, lsl #16 +; CHECK: orr x0, [[REG1]], [[REG2]] +define i64 @test8(i64 %a) { + %1 = and i64 %a, -72057594037927681 ; 0xff000000000000ff + %2 = or i64 %1, 157601565442048 ; 0x00008f5679530000 + ret i64 %2 +}