Index: llvm/trunk/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -66,6 +66,11 @@
                       MachineBasicBlock::iterator &NextMBBI);
   bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                     unsigned BitSize);
+  bool expandMOVImmSimple(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          unsigned BitSize,
+                          unsigned OneChunks,
+                          unsigned ZeroChunks);
 
   bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                       unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
@@ -107,57 +112,6 @@
   return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
 }
 
-/// Helper function which replicates a 16-bit chunk within a 64-bit
-/// value. Indices correspond to element numbers in a v4i16.
-static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
-  assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
-  const unsigned ShiftAmt = ToIdx * 16;
-
-  // Replicate the source chunk to the destination position.
-  const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
-  // Clear the destination chunk.
-  Imm &= ~(0xFFFFLL << ShiftAmt);
-  // Insert the replicated chunk.
-  return Imm | Chunk;
-}
-
-/// Helper function which tries to materialize a 64-bit value with an
-/// ORR + MOVK instruction sequence.
-static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
-                       MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator &MBBI,
-                       const AArch64InstrInfo *TII, unsigned ChunkIdx) {
-  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
-  const unsigned ShiftAmt = ChunkIdx * 16;
-
-  uint64_t Encoding;
-  if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
-    // Create the ORR-immediate instruction.
-    MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
-            .add(MI.getOperand(0))
-            .addReg(AArch64::XZR)
-            .addImm(Encoding);
-
-    // Create the MOVK instruction.
-    const unsigned Imm16 = getChunk(UImm, ChunkIdx);
-    const unsigned DstReg = MI.getOperand(0).getReg();
-    const bool DstIsDead = MI.getOperand(0).isDead();
-    MachineInstrBuilder MIB1 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
-            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
-            .addReg(DstReg)
-            .addImm(Imm16)
-            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
-
-    transferImpOps(MI, MIB, MIB1);
-    MI.eraseFromParent();
-    return true;
-  }
-
-  return false;
-}
-
 /// Check whether the given 16-bit chunk replicated to full 64-bit width
 /// can be materialized with an ORR instruction.
 static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
@@ -440,7 +394,22 @@
     return true;
   }
 
-  // Try a MOVI instruction (aka ORR-immediate with the zero register).
+  // Scan the immediate and count the number of 16-bit chunks which are either
+  // all ones or all zeros.
+  unsigned OneChunks = 0;
+  unsigned ZeroChunks = 0;
+  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+    const unsigned Chunk = (Imm >> Shift) & Mask;
+    if (Chunk == Mask)
+      OneChunks++;
+    else if (Chunk == 0)
+      ZeroChunks++;
+  }
+
+  // FIXME: Prefer MOVZ/MOVN over ORR because of the rules for the "mov"
+  // alias.
+
+  // Try a single ORR.
   uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
   uint64_t Encoding;
   if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
@@ -455,74 +424,69 @@
     return true;
   }
 
-  // Scan the immediate and count the number of 16-bit chunks which are either
-  // all ones or all zeros.
-  unsigned OneChunks = 0;
-  unsigned ZeroChunks = 0;
+  // Two instruction sequences.
+  //
+  // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
+  // fastest sequence with fast literal generation.
+  if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2)
+    return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+
+  assert(BitSize == 64 && "All 32-bit immediates can be expanded with a"
+                          "MOVZ/MOVK pair");
+
+  // Try other two-instruction sequences.
+
+  // 64-bit ORR followed by MOVK.
+  // We try to construct the ORR immediate in three different ways: either we
+  // zero out the chunk which will be replaced, we fill the chunk which will
+  // be replaced with ones, or we take the bit pattern from the other half of
+  // the 64-bit immediate. This is comprehensive because of the way ORR
+  // immediates are constructed.
   for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
-    const unsigned Chunk = (Imm >> Shift) & Mask;
-    if (Chunk == Mask)
-      OneChunks++;
-    else if (Chunk == 0)
-      ZeroChunks++;
-  }
+    uint64_t ShiftedMask = (0xFFFFULL << Shift);
+    uint64_t ZeroChunk = UImm & ~ShiftedMask;
+    uint64_t OneChunk = UImm | ShiftedMask;
+    uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
+    uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
+    if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
+        AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
+        AArch64_AM::processLogicalImmediate(ReplicateChunk,
+                                            BitSize, Encoding)) {
+      // Create the ORR-immediate instruction.
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+              .add(MI.getOperand(0))
+              .addReg(AArch64::XZR)
+              .addImm(Encoding);
+
+      // Create the MOVK instruction.
+      const unsigned Imm16 = getChunk(UImm, Shift / 16);
+      const unsigned DstReg = MI.getOperand(0).getReg();
+      const bool DstIsDead = MI.getOperand(0).isDead();
+      MachineInstrBuilder MIB1 =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+              .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+              .addReg(DstReg)
+              .addImm(Imm16)
+              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
 
-  // Since we can't materialize the constant with a single ORR instruction,
-  // let's see whether we can materialize 3/4 of the constant with an ORR
-  // instruction and use an additional MOVK instruction to materialize the
-  // remaining 1/4.
-  //
-  // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
-  //
-  // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
-  // we would create the following instruction sequence:
-  //
-  // ORR x0, xzr, |A|X|A|X|
-  // MOVK x0, |B|, LSL #16
-  //
-  // Only look at 64-bit constants which can't be materialized with a single
-  // instruction e.g. which have less than either three all zero or all one
-  // chunks.
-  //
-  // Ignore 32-bit constants here, they always can be materialized with a
-  // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
-  // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
-  // Thus we fall back to the default code below which in the best case creates
-  // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
-  //
-  if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
-    // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
-    // identical?
-    if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
-      // See if we can come up with a constant which can be materialized with
-      // ORR-immediate by replicating element 3 into element 1.
-      uint64_t OrrImm = replicateChunk(UImm, 3, 1);
-      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
-        return true;
-
-      // See if we can come up with a constant which can be materialized with
-      // ORR-immediate by replicating element 1 into element 3.
-      OrrImm = replicateChunk(UImm, 1, 3);
-      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
-        return true;
-
-      // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
-      // identical?
-    } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
-      // See if we can come up with a constant which can be materialized with
-      // ORR-immediate by replicating element 2 into element 0.
-      uint64_t OrrImm = replicateChunk(UImm, 2, 0);
-      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
-        return true;
-
-      // See if we can come up with a constant which can be materialized with
-      // ORR-immediate by replicating element 1 into element 3.
-      OrrImm = replicateChunk(UImm, 0, 2);
-      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
-        return true;
+      transferImpOps(MI, MIB, MIB1);
+      MI.eraseFromParent();
+      return true;
     }
   }
 
+  // FIXME: Add more two-instruction sequences.
+
+  // Three instruction sequences.
+  //
+  // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
+  // the fastest sequence with fast literal generation. (If neither MOVK is
+  // part of a fast literal generation pair, it could be slower than the
+  // four-instruction sequence, but we won't worry about that for now.)
+  if (OneChunks || ZeroChunks)
+    return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+
   // Check for identical 16-bit chunks within the constant and if so materialize
   // them with a single ORR instruction. The remaining one or two 16-bit chunks
   // will be materialized with MOVK instructions.
@@ -537,6 +501,23 @@
   if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
     return true;
 
+  // We found no possible two or three instruction sequence; use the general
+  // four-instruction sequence.
+  return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
+/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
+bool AArch64ExpandPseudo::expandMOVImmSimple(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MBBI,
+                                             unsigned BitSize,
+                                             unsigned OneChunks,
+                                             unsigned ZeroChunks) {
+  MachineInstr &MI = *MBBI;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  uint64_t Imm = MI.getOperand(1).getImm();
+  const unsigned Mask = 0xFFFF;
+
   // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
   // more MOVK instructions to insert additional 16-bit portions into the
   // lower bits.
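
The rewritten expandMOVImm above is, in effect, a four-tier decision ladder: count the all-zero and all-ones 16-bit chunks once, take the MOVZ/MOVN route when at most two chunks are "hard", otherwise probe the three ORR-immediate candidates, and only then fall back to longer MOVK chains. The standalone C++ sketch below mirrors that ladder for 64-bit immediates; it is illustrative only, not LLVM code: classifyMovImm64 and OrrMovkWorks are hypothetical names, the flag stands in for the three processLogicalImmediate probes, and the one-instruction cases are omitted.

#include <cstdint>
#include <cstdio>

// Sketch of the tiering in the patched expandMOVImm for BitSize == 64.
static const char *classifyMovImm64(uint64_t Imm, bool OrrMovkWorks) {
  unsigned OneChunks = 0, ZeroChunks = 0;
  for (unsigned Shift = 0; Shift < 64; Shift += 16) {
    uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
    if (Chunk == 0xFFFF)
      ++OneChunks;
    else if (Chunk == 0)
      ++ZeroChunks;
  }
  // (BitSize / 16) - 2 == 2: with at most two "hard" chunks, a MOVZ/MOVN
  // plus at most one MOVK suffices.
  if (OneChunks >= 2 || ZeroChunks >= 2)
    return "MOVZ/MOVN + MOVK (two instructions)";
  if (OrrMovkWorks) // stands in for the ZeroChunk/OneChunk/ReplicateChunk probes
    return "ORR + MOVK (two instructions)";
  if (OneChunks || ZeroChunks)
    return "MOVZ/MOVN + 2x MOVK (three instructions)";
  return "MOVZ/MOVN + 3x MOVK (four instructions)";
}

int main() {
  // 0x0000123400005678 has two all-zero chunks, so the first tier fires.
  printf("%s\n", classifyMovImm64(0x0000123400005678ULL, false));
  return 0;
}

The same counts feed every tier, which is why the patch hoists the chunk scan to the top of expandMOVImm instead of recomputing it per strategy.
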
Index: llvm/trunk/test/CodeGen/AArch64/arm64-movi.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-movi.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-movi.ll
@@ -134,18 +134,6 @@
   ret i64 -279156097024
 }
 
-; FIXME: prefer "mov w0, #-63; movk x0, #31, lsl #32"
-; or "mov x0, #137438887936; movk x0, #65473"
-define i64 @mvn32_pattern() nounwind {
-; CHECK-LABEL: mvn32_pattern:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x0, #65473
-; CHECK-NEXT:    movk x0, #65535, lsl #16
-; CHECK-NEXT:    movk x0, #31, lsl #32
-; CHECK-NEXT:    ret
-  ret i64 137438953409
-}
-
 ; FIXME: prefer "mov w0, #-63; movk x0, #17, lsl #32"
 define i64 @mvn32_pattern_2() nounwind {
 ; CHECK-LABEL: mvn32_pattern_2:
@@ -281,9 +269,9 @@
 define i64 @orr_movk11() nounwind {
 ; CHECK-LABEL: orr_movk11:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x0, #-4503599627370241
+; CHECK-NEXT:    mov x0, #-65281
 ; CHECK-NEXT:    movk x0, #57005, lsl #16
-; CHECK-NEXT:    movk x0, #65535, lsl #32
+; CHECK-NEXT:    movk x0, #65520, lsl #48
 ; CHECK-NEXT:    ret
   ret i64 -4222125209747201
 }
@@ -318,24 +306,20 @@
   ret i64 -281474976710654
 }
 
-; FIXME: prefer "mov x0, #-549755813888; movk x0, 2048, lsl #16"
 define i64 @orr_movk14() nounwind {
 ; CHECK-LABEL: orr_movk14:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x0, #134217728
-; CHECK-NEXT:    movk x0, #65408, lsl #32
-; CHECK-NEXT:    movk x0, #65535, lsl #48
+; CHECK-NEXT:    mov x0, #-549755813888
+; CHECK-NEXT:    movk x0, #2048, lsl #16
 ; CHECK-NEXT:    ret
   ret i64 -549621596160
 }
 
-; FIXME: prefer "mov x0, #549755813887; movk x0, #63487, lsl #16"
 define i64 @orr_movk15() nounwind {
 ; CHECK-LABEL: orr_movk15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x0, #65535
+; CHECK-NEXT:    mov x0, #549755813887
 ; CHECK-NEXT:    movk x0, #63487, lsl #16
-; CHECK-NEXT:    movk x0, #127, lsl #32
 ; CHECK-NEXT:    ret
   ret i64 549621596159
 }
@@ -351,24 +335,121 @@
   ret i64 36028661727494142
 }
 
-; FIXME: prefer "mov x0, #-1099511627776; movk x0, #65280, lsl #16"
 define i64 @orr_movk17() nounwind {
 ; CHECK-LABEL: orr_movk17:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x0, #-71777214294589696
-; CHECK-NEXT:    movk x0, #0
-; CHECK-NEXT:    movk x0, #65535, lsl #48
+; CHECK-NEXT:    mov x0, #-1099511627776
+; CHECK-NEXT:    movk x0, #65280, lsl #16
 ; CHECK-NEXT:    ret
   ret i64 -1095233437696
 }
 
-; FIXME: prefer "mov x0, #72340172838076673; and x0, x0, #2199023255296"
 define i64 @orr_movk18() nounwind {
 ; CHECK-LABEL: orr_movk18:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x0, #72340172838076673
-; CHECK-NEXT:    movk x0, #256
-; CHECK-NEXT:    movk x0, #0, lsl #48
+; CHECK-NEXT:    mov x0, #137438887936
+; CHECK-NEXT:    movk x0, #65473
+; CHECK-NEXT:    ret
+  ret i64 137438953409
+}
+
+; FIXME: prefer "mov x0, #72340172838076673; and x0, x0, #2199023255296"
+define i64 @orr_and() nounwind {
+; CHECK-LABEL: orr_and:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #256
+; CHECK-NEXT:    movk x0, #257, lsl #16
+; CHECK-NEXT:    movk x0, #257, lsl #32
 ; CHECK-NEXT:    ret
   ret i64 1103823438080
 }
+
+; FIXME: prefer "mov w0, #-1431655766; movk x0, #9, lsl #32"
+define i64 @movn_movk() nounwind {
+; CHECK-LABEL: movn_movk:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #43690
+; CHECK-NEXT:    movk x0, #43690, lsl #16
+; CHECK-NEXT:    movk x0, #9, lsl #32
+; CHECK-NEXT:    ret
+  ret i64 41518017194
+}
+
+; FIXME: prefer "mov w0, #-13690; orr x0, x0, #0x1111111111111111"
+define i64 @movn_orr() nounwind {
+; CHECK-LABEL: movn_orr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #-51847
+; CHECK-NEXT:    movk x0, #4369, lsl #32
+; CHECK-NEXT:    movk x0, #4369, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 1229782942255887737
+}
+
+; FIXME: prefer "mov w0, #-305397761; eor x0, x0, #0x3333333333333333"
+define i64 @movn_eor() nounwind {
+; CHECK-LABEL: movn_eor:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #3689348814741910323
+; CHECK-NEXT:    movk x0, #52428
+; CHECK-NEXT:    movk x0, #8455, lsl #16
+; CHECK-NEXT:    ret
+  ret i64 3689348814437076172
+}
+
+; FIXME: prefer "mov x0, #536866816; orr x0, x0, #0x3fff800000000000"
+define i64 @orr_orr_64() nounwind {
+; CHECK-LABEL: orr_orr_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #4611545280939032576
+; CHECK-NEXT:    movk x0, #61440
+; CHECK-NEXT:    movk x0, #8191, lsl #16
+; CHECK-NEXT:    ret
+  ret i64 4611545281475899392
+}
+
+; FIXME: prefer "mov x0, #558551907040256; orr x0, x0, #0x1000100010001000"
+define i64 @orr_orr_32() nounwind {
+; CHECK-LABEL: orr_orr_32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #-287953294993589248
+; CHECK-NEXT:    movk x0, #7169, lsl #16
+; CHECK-NEXT:    movk x0, #7169, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 2018171185438784512
+}
+
+; FIXME: prefer "mov x0, #281479271743489; orr x0, x0, #0x1000100010001000"
+define i64 @orr_orr_16() nounwind {
+; CHECK-LABEL: orr_orr_16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #4097
+; CHECK-NEXT:    movk x0, #4097, lsl #16
+; CHECK-NEXT:    movk x0, #4097, lsl #32
+; CHECK-NEXT:    movk x0, #4097, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 1153220576333074433
+}
+
+; FIXME: prefer "mov x0, #144680345676153346; orr x0, x0, #0x1818181818181818"
+define i64 @orr_orr_8() nounwind {
+; CHECK-LABEL: orr_orr_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #6682
+; CHECK-NEXT:    movk x0, #6682, lsl #16
+; CHECK-NEXT:    movk x0, #6682, lsl #32
+; CHECK-NEXT:    movk x0, #6682, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 1880844493789993498
+}
+
+; FIXME: prefer "mov x0, #-6148914691236517206; orr x0, x0, #0x0FFFFF0000000000"
+define i64 @orr_64_orr_8() nounwind {
+; CHECK-LABEL: orr_64_orr_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #-6148914691236517206
+; CHECK-NEXT:    movk x0, #65450, lsl #32
+; CHECK-NEXT:    movk x0, #45055, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 -5764607889538110806
+}
Index: llvm/trunk/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -32,8 +32,8 @@
 ; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #128
 ; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
 
-; CHECK: mov [[GRVR:x[0-9]+]], #-545460846720
-; CHECK: movk [[GRVR]], #65480
+; CHECK: mov [[GRVR:x[0-9]+]], #-56
+; CHECK: movk [[GRVR]], #65408, lsl #32
 ; CHECK: str [[GRVR]], [x[[VA_LIST]], #24]
 
   %addr = bitcast %va_list* @var to i8*
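
The arm64-variadic-aapcs.ll update shows expandMOVImmSimple picking MOVN as the fill value: 0xFFFFFF80FFFFFFC8 is now materialized as mov Xn, #-56 (the mov alias of MOVN) plus one MOVK, instead of a MOVK chain patching a longer literal. The standalone C++ sketch below illustrates that fill-value choice; it only prints text and is not LLVM's emission code (printMovSequence is a hypothetical name, and the register is fixed to x0).

#include <cstdint>
#include <cstdio>

// Print a MOVZ/MOVN + MOVK sequence for Imm: pre-fill the register with ones
// (MOVN) or zeros (MOVZ), whichever leaves fewer 16-bit chunks for MOVK.
static void printMovSequence(uint64_t Imm) {
  if (Imm == 0 || Imm == ~0ULL) { // degenerate single-instruction cases
    printf(Imm ? "movn x0, #0\n" : "movz x0, #0\n");
    return;
  }
  unsigned OneChunks = 0, ZeroChunks = 0;
  for (unsigned Shift = 0; Shift < 64; Shift += 16) {
    uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
    if (Chunk == 0xFFFF)
      ++OneChunks;
    else if (Chunk == 0)
      ++ZeroChunks;
  }
  bool UseMovn = OneChunks > ZeroChunks;
  uint64_t Fill = UseMovn ? 0xFFFF : 0; // chunks the first insn sets for free
  bool First = true;
  for (unsigned Shift = 0; Shift < 64; Shift += 16) {
    uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
    if (Chunk == Fill)
      continue; // already correct after the MOVZ/MOVN fill
    if (First) {
      // MOVN inverts the shifted 16-bit operand, so emit the complement.
      printf("%s x0, #%llu, lsl #%u\n", UseMovn ? "movn" : "movz",
             (unsigned long long)(UseMovn ? (~Chunk & 0xFFFF) : Chunk), Shift);
      First = false;
    } else {
      printf("movk x0, #%llu, lsl #%u\n", (unsigned long long)Chunk, Shift);
    }
  }
}

int main() {
  // Prints "movn x0, #55, lsl #0" then "movk x0, #65408, lsl #32"; the
  // assembler's mov alias renders the first as "mov x0, #-56", matching the
  // CHECK lines above.
  printMovSequence(0xFFFFFF80FFFFFFC8ULL);
  return 0;
}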