diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp --- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -240,6 +240,113 @@ return true; } +static inline uint64_t rotateLeft(uint64_t Val, int N) { + return (Val << (N & 63)) | (Val >> ((64 - N) & 63)); +} + +static inline uint64_t rotateRight(uint64_t Val, int N) { + return (Val >> (N & 63)) | (Val << ((64 - N) & 63)); +} + +/// Check whether the constant can be represented by exclusive-or of two 64-bit +/// logical immediates. If so, materialize it with an ORR instruction followed +/// by an EOR instruction. +/// +/// This encoding allows all remaining repeated byte patterns, and many repeated +/// 16-bit values, to be encoded without needing four instructions. It can also +/// represent some irregular bitmasks (although those would mostly only need +/// three instructions otherwise). +static bool tryEorTwoLogicalImm64s(uint64_t Imm, + SmallVectorImpl &Insn) { + // Determine the larger repetition size of the two possible logical + // immediates, by finding the repetition size of Imm. + unsigned BigSize = 64; + + do { + BigSize /= 2; + uint64_t Mask = (1ULL << BigSize) - 1; + + if ((Imm & Mask) != ((Imm >> BigSize) & Mask)) { + BigSize *= 2; + break; + } + } while (BigSize > 2); + + uint64_t BigMask = ((uint64_t)-1LL) >> (64 - BigSize); + + // Find the last bit of each run of ones, circularly. For runs which wrap + // around from bit 0 to bit 63, this is the bit before the most-significant + // zero, otherwise it is the least-significant bit in the run of ones. + uint64_t RunStarts = Imm & ~rotateLeft(Imm, 1); + + // Find the smaller repetition size of the two possible logical immediates by + // counting the number of runs of one-bits within the BigSize-bit value. Both + // sizes may be the same. The EOR may add one or subtract one from the + // power-of-two count that can be represented by a logical immediate, or it + // may be left unchanged. + int RunsPerBigChunk = countPopulation(RunStarts & BigMask); + + static const int8_t BigToSmallSizeTable[32] = { + -1, -1, 0, 1, 2, 2, -1, 3, 3, 3, -1, -1, -1, -1, -1, 4, + 4, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, + }; + + int BigToSmallShift = BigToSmallSizeTable[RunsPerBigChunk]; + + // Early-exit if the big chunk couldn't be a power-of-two number of runs + // EORed with another single run. + if (BigToSmallShift == -1) + return false; + + unsigned SmallSize = BigSize >> BigToSmallShift; + + // 64-bit values with a bit set every (1 << index) bits. + static const uint64_t RepeatedOnesTable[] = { + 0xffffffffffffffff, 0x5555555555555555, 0x1111111111111111, + 0x0101010101010101, 0x0001000100010001, 0x0000000100000001, + 0x0000000000000001, + }; + + // This RepeatedOnesTable lookup is a faster implementation of the division + // 0xffffffffffffffff / ((1 << SmallSize) - 1), and can be thought of as + // dividing the 64-bit value into fields of width SmallSize, and placing a + // one in the least significant bit of each field. + uint64_t SmallOnes = RepeatedOnesTable[countTrailingZeros(SmallSize)]; + + // Now we try to find the number of ones in each of the smaller repetitions, + // by looking at runs of ones in Imm. This can take three attempts, as the + // EOR may have changed the length of the first two runs we find. + + // Rotate a run of ones so we can count it in the trailing bits. + int Rotation = countTrailingZeros(RunStarts); + uint64_t RotatedImm = rotateRight(Imm, Rotation); + for (int Attempt = 0; Attempt < 3; ++Attempt) { + unsigned RunLength = countTrailingOnes(RotatedImm); + + // Construct candidate values BigImm and SmallImm, such that if these two + // values are encodable, we have a solution. (SmallImm is constructed to be + // encodable, but this isn't guaranteed when RunLength >= SmallSize) + uint64_t SmallImm = + rotateLeft((SmallOnes << RunLength) - SmallOnes, Rotation); + uint64_t BigImm = Imm ^ SmallImm; + + uint64_t BigEncoding = 0; + uint64_t SmallEncoding = 0; + if (AArch64_AM::processLogicalImmediate(BigImm, 64, BigEncoding) && + AArch64_AM::processLogicalImmediate(SmallImm, 64, SmallEncoding)) { + Insn.push_back({AArch64::ORRXri, 0, BigEncoding}); + Insn.push_back({AArch64::EORXri, 0, SmallEncoding}); + return true; + } + + // Rotate to the next run of ones + Rotation += countTrailingZeros(rotateRight(RunStarts, Rotation) & ~1); + RotatedImm = rotateRight(Imm, Rotation); + } + + return false; +} + /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a /// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions. static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize, @@ -373,6 +480,11 @@ } } + // Check if the instruction can be encoded with a 64-bit ORR followed by a + // 64-bit EOR. + if (BitSize == 64 && tryEorTwoLogicalImm64s(UImm, Insn)) + return; + // FIXME: Add more two-instruction sequences. // Three instruction sequences. diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -149,6 +149,16 @@ .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) .addImm(I->Op2)); break; + case AArch64::EORXri: { + bool DstIsDead = MI.getOperand(0).isDead(); + MIBS.push_back( + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && LastItem) | + RenamableState) + .addReg(DstReg) + .addImm(I->Op2)); + } break; case AArch64::MOVNWi: case AArch64::MOVNXi: case AArch64::MOVZWi: diff --git a/llvm/test/CodeGen/AArch64/arm64-movi.ll b/llvm/test/CodeGen/AArch64/arm64-movi.ll --- a/llvm/test/CodeGen/AArch64/arm64-movi.ll +++ b/llvm/test/CodeGen/AArch64/arm64-movi.ll @@ -322,13 +322,11 @@ ret i64 549621596159 } -; FIXME: prefer "mov x0, #2147483646; orr x0, x0, #36028659580010496" -define i64 @orr_movk16() nounwind { -; CHECK-LABEL: orr_movk16: +define i64 @orr_eor() nounwind { +; CHECK-LABEL: orr_eor: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x0, #36028659580010496 -; CHECK-NEXT: movk x0, #65534 -; CHECK-NEXT: movk x0, #32767, lsl #16 +; CHECK-NEXT: eor x0, x0, #0x7ffffffe ; CHECK-NEXT: ret ret i64 36028661727494142 } @@ -395,52 +393,199 @@ ret i64 3689348814437076172 } -; FIXME: prefer "mov x0, #536866816; orr x0, x0, #0x3fff800000000000" -define i64 @orr_orr_64() nounwind { -; CHECK-LABEL: orr_orr_64: +;==--------------------------------------------------------------------------== +; Tests for ORR with EOR. +;==--------------------------------------------------------------------------== + +define i64 @orr_eor_64() nounwind { +; CHECK-LABEL: orr_eor_64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x0, #4611545280939032576 -; CHECK-NEXT: movk x0, #61440 -; CHECK-NEXT: movk x0, #8191, lsl #16 +; CHECK-NEXT: eor x0, x0, #0x1ffff000 ; CHECK-NEXT: ret ret i64 4611545281475899392 } -; FIXME: prefer "mov x0, #558551907040256; orr x0, x0, #0x1000100010001000" -define i64 @orr_orr_32() nounwind { -; CHECK-LABEL: orr_orr_32: +define i64 @orr_eor_32() nounwind { +; CHECK-LABEL: orr_eor_32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #-287953294993589248 -; CHECK-NEXT: movk x0, #7169, lsl #16 -; CHECK-NEXT: movk x0, #7169, lsl #48 +; CHECK-NEXT: mov x0, #2017612633531744256 +; CHECK-NEXT: eor x0, x0, #0x1fc000001fc00 ; CHECK-NEXT: ret ret i64 2018171185438784512 } -; FIXME: prefer "mov x0, #281479271743489; orr x0, x0, #0x1000100010001000" -define i64 @orr_orr_16() nounwind { -; CHECK-LABEL: orr_orr_16: +define i64 @orr_eor_16() nounwind { +; CHECK-LABEL: orr_eor_16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #4097 -; CHECK-NEXT: movk x0, #4097, lsl #16 -; CHECK-NEXT: movk x0, #4097, lsl #32 -; CHECK-NEXT: movk x0, #4097, lsl #48 +; CHECK-NEXT: mov x0, #1152939097061330944 +; CHECK-NEXT: eor x0, x0, #0x1000100010001 ; CHECK-NEXT: ret ret i64 1153220576333074433 } -; FIXME: prefer "mov x0, #144680345676153346; orr x0, x0, #0x1818181818181818" -define i64 @orr_orr_8() nounwind { -; CHECK-LABEL: orr_orr_8: +define i64 @orr_eor_8() nounwind { +; CHECK-LABEL: orr_eor_8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #6682 -; CHECK-NEXT: movk x0, #6682, lsl #16 -; CHECK-NEXT: movk x0, #6682, lsl #32 -; CHECK-NEXT: movk x0, #6682, lsl #48 +; CHECK-NEXT: mov x0, #1736164148113840152 +; CHECK-NEXT: eor x0, x0, #0x202020202020202 ; CHECK-NEXT: ret ret i64 1880844493789993498 } +define i64 @orr_eor_2_16() nounwind { +; CHECK-LABEL: orr_eor_2_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #7881419608817692 +; CHECK-NEXT: eor x0, x0, #0xaaaaaaaaaaaaaaaa +; CHECK-NEXT: ret + ret i64 -6145536939975595338 +} + +define i64 @orr_eor_2_32() nounwind { +; CHECK-LABEL: orr_eor_2_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #120259084316 +; CHECK-NEXT: eor x0, x0, #0xaaaaaaaaaaaaaaaa +; CHECK-NEXT: ret + ret i64 -6148914639696909642 +} + +define i64 @orr_eor_2_64() nounwind { +; CHECK-LABEL: orr_eor_2_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #-2147483645 +; CHECK-NEXT: eor x0, x0, #0x5555555555555555 +; CHECK-NEXT: ret + ret i64 -6148914690520689322 +} + +define i64 @orr_eor_4_8() nounwind { +; CHECK-LABEL: orr_eor_4_8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1012762419733073422 +; CHECK-NEXT: eor x0, x0, #0x4444444444444444 +; CHECK-NEXT: ret + ret i64 5353172790017673802 +} + +define i64 @orr_eor_4_16() nounwind { +; CHECK-LABEL: orr_eor_4_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1688875630460934 +; CHECK-NEXT: eor x0, x0, #0x3333333333333333 +; CHECK-NEXT: ret + ret i64 3689911773285397301 +} + +define i64 @orr_eor_4_32() nounwind { +; CHECK-LABEL: orr_eor_4_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #206158430256 +; CHECK-NEXT: eor x0, x0, #0x9999999999999999 +; CHECK-NEXT: ret + ret i64 -7378697560764343895 +} + +define i64 @orr_eor_4_64() nounwind { +; CHECK-LABEL: orr_eor_4_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #-36028797018963961 +; CHECK-NEXT: eor x0, x0, #0x8888888888888888 +; CHECK-NEXT: ret + ret i64 8577255610314688655 +} + +define i64 @orr_eor_8_8() nounwind { +; CHECK-LABEL: orr_eor_8_8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #-4557430888798830400 +; CHECK-NEXT: eor x0, x0, #0x1010101010101010 +; CHECK-NEXT: ret + ret i64 -3399988123389603632 +} + +define i64 @orr_eor_8_16() nounwind { +; CHECK-LABEL: orr_eor_8_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #35747867511423103 +; CHECK-NEXT: eor x0, x0, #0xc0c0c0c0c0c0c0c +; CHECK-NEXT: ret + ret i64 897074439046499443 +} + +define i64 @orr_eor_8_32() nounwind { +; CHECK-LABEL: orr_eor_8_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1924145349056 +; CHECK-NEXT: eor x0, x0, #0xe3e3e3e3e3e3e3e3 +; CHECK-NEXT: ret + ret i64 -2025526763611495901 +} + +define i64 @orr_eor_8_64() nounwind { +; CHECK-LABEL: orr_eor_8_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #261888 +; CHECK-NEXT: eor x0, x0, #0x8080808080808080 +; CHECK-NEXT: ret + ret i64 -9187201950435541120 +} + +define i64 @orr_eor_16_16() nounwind { +; CHECK-LABEL: orr_eor_16_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #2305878194122661888 +; CHECK-NEXT: eor x0, x0, #0x3f003f003f003f +; CHECK-NEXT: ret + ret i64 2323611388242501695 +} + +define i64 @orr_eor_16_32() nounwind { +; CHECK-LABEL: orr_eor_16_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1099511628032 +; CHECK-NEXT: eor x0, x0, #0xffe0ffe0ffe0ffe0 +; CHECK-NEXT: ret + ret i64 -8726956935676192 +} + +define i64 @orr_eor_16_64() nounwind { +; CHECK-LABEL: orr_eor_16_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1048512 +; CHECK-NEXT: eor x0, x0, #0xe000e000e000e00 +; CHECK-NEXT: ret + ret i64 1008821709929705920 +} + +define i64 @orr_eor_32_32() nounwind { +; CHECK-LABEL: orr_eor_32_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #4035225267063488512 +; CHECK-NEXT: eor x0, x0, #0x78000000780 +; CHECK-NEXT: ret + ret i64 4035233513400698752 +} + +define i64 @orr_eor_32_64() nounwind { +; CHECK-LABEL: orr_eor_32_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #134215680 +; CHECK-NEXT: eor x0, x0, #0x3e0000003e000 +; CHECK-NEXT: ret + ret i64 1090715668715520 +} + +define i64 @orr_eor_64_64() nounwind { +; CHECK-LABEL: orr_eor_64_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #-4503599627370465 +; CHECK-NEXT: eor x0, x0, #0x7fff8000000 +; CHECK-NEXT: ret + ret i64 -4494803668565985 +} + ; FIXME: prefer "mov x0, #-6148914691236517206; orr x0, x0, #0x0FFFFF0000000000" define i64 @orr_64_orr_8() nounwind { ; CHECK-LABEL: orr_64_orr_8: