Index: llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -508,6 +508,82 @@
   return true;
 }
 
+// If a BNE on the cmpxchg comparison result immediately follows the cmpxchg
+// operation, it can be folded into the cmpxchg expansion by
+// modifying the branch within 'LoopHead' (which performs the same
+// comparison). This is a valid transformation because after altering the
+// LoopHead's BNE destination, the BNE following the cmpxchg becomes
+// redundant and can be deleted. In the case of a masked cmpxchg, an
+// appropriate AND and BNE must be matched, and the BNE must be the last use
+// of the AND result (it must carry a kill flag), because the AND is deleted
+// as well.
+//
+// On success, returns true and deletes the matching BNE or AND+BNE, sets the
+// LoopHeadBNETarget argument to the target that should be used within the
+// loop head, and removes that block as a successor to MBB. On failure,
+// returns false and leaves MBB and LoopHeadBNETarget untouched, so callers
+// may ignore the return value and simply use LoopHeadBNETarget.
+static bool tryToFoldBNEOnCmpXchgResult(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        Register DestReg, Register CmpValReg,
+                                        bool IsMasked, Register MaskReg,
+                                        MachineBasicBlock *&LoopHeadBNETarget) {
+  SmallVector<MachineInstr *> ToErase;
+  auto E = MBB.end();
+  if (MBBI == E)
+    return false;
+  MBBI = skipDebugInstructionsForward(MBBI, E);
+
+  // If we have a masked cmpxchg, match AND dst, DestReg, MaskReg (the two
+  // source operands may appear in either order).
+  if (IsMasked) {
+    if (MBBI == E || MBBI->getOpcode() != RISCV::AND)
+      return false;
+    Register ANDOp1 = MBBI->getOperand(1).getReg();
+    Register ANDOp2 = MBBI->getOperand(2).getReg();
+    if (!(ANDOp1 == DestReg && ANDOp2 == MaskReg) &&
+        !(ANDOp1 == MaskReg && ANDOp2 == DestReg))
+      return false;
+    // We now expect the BNE to use the result of the AND as an operand.
+    DestReg = MBBI->getOperand(0).getReg();
+    ToErase.push_back(&*MBBI);
+    MBBI = skipDebugInstructionsForward(std::next(MBBI), E);
+  }
+
+  // Match BNE DestReg, CmpValReg (operands may appear in either order).
+  if (MBBI == E || MBBI->getOpcode() != RISCV::BNE)
+    return false;
+  Register BNEOp0 = MBBI->getOperand(0).getReg();
+  Register BNEOp1 = MBBI->getOperand(1).getReg();
+  if (!(BNEOp0 == DestReg && BNEOp1 == CmpValReg) &&
+      !(BNEOp0 == CmpValReg && BNEOp1 == DestReg))
+    return false;
+
+  // The AND is about to be erased, so the BNE must be the last user of its
+  // result; otherwise a later use would be left without its definition.
+  if (IsMasked) {
+    if (BNEOp0 == DestReg && !MBBI->getOperand(0).isKill())
+      return false;
+    if (BNEOp1 == DestReg && !MBBI->getOperand(1).isKill())
+      return false;
+  }
+
+  ToErase.push_back(&*MBBI);
+  MachineBasicBlock *BNETarget = MBBI->getOperand(2).getMBB();
+
+  // The BNE must be the last (non-debug) instruction in the block.
+  MBBI = skipDebugInstructionsForward(std::next(MBBI), E);
+  if (MBBI != E)
+    return false;
+
+  // Every check passed: only now publish the target and mutate the block.
+  LoopHeadBNETarget = BNETarget;
+  MBB.removeSuccessor(LoopHeadBNETarget);
+  for (auto *MI : ToErase)
+    MI->eraseFromParent();
+  return true;
+}
+
 bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked,
     int Width, MachineBasicBlock::iterator &NextMBBI) {
@@ -518,6 +594,19 @@
   auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
   auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
 
+  Register DestReg = MI.getOperand(0).getReg();
+  Register ScratchReg = MI.getOperand(1).getReg();
+  Register AddrReg = MI.getOperand(2).getReg();
+  Register CmpValReg = MI.getOperand(3).getReg();
+  Register NewValReg = MI.getOperand(4).getReg();
+  Register MaskReg = IsMasked ? MI.getOperand(5).getReg() : Register(0);
+
+  // Branch straight to the target of a trailing BNE on the cmpxchg result when
+  // that BNE can be folded away; otherwise LoopHeadBNETarget stays DoneMBB.
+  MachineBasicBlock *LoopHeadBNETarget = DoneMBB;
+  tryToFoldBNEOnCmpXchgResult(MBB, std::next(MBBI), DestReg, CmpValReg,
+                              IsMasked, MaskReg, LoopHeadBNETarget);
+
   // Insert new MBBs.
   MF->insert(++MBB.getIterator(), LoopHeadMBB);
   MF->insert(++LoopHeadMBB->getIterator(), LoopTailMBB);
@@ -525,18 +614,13 @@
 
   // Set up successors and transfer remaining instructions to DoneMBB.
   LoopHeadMBB->addSuccessor(LoopTailMBB);
-  LoopHeadMBB->addSuccessor(DoneMBB);
+  LoopHeadMBB->addSuccessor(LoopHeadBNETarget);
   LoopTailMBB->addSuccessor(DoneMBB);
   LoopTailMBB->addSuccessor(LoopHeadMBB);
   DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
   DoneMBB->transferSuccessors(&MBB);
   MBB.addSuccessor(LoopHeadMBB);
 
-  Register DestReg = MI.getOperand(0).getReg();
-  Register ScratchReg = MI.getOperand(1).getReg();
-  Register AddrReg = MI.getOperand(2).getReg();
-  Register CmpValReg = MI.getOperand(3).getReg();
-  Register NewValReg = MI.getOperand(4).getReg();
   AtomicOrdering Ordering =
       static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm());
 
@@ -549,7 +633,7 @@
     BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
         .addReg(DestReg)
         .addReg(CmpValReg)
-        .addMBB(DoneMBB);
+        .addMBB(LoopHeadBNETarget);
     // .looptail:
     //   sc.[w|d] scratch, newval, (addr)
     //   bnez scratch, loophead
@@ -574,7 +658,7 @@
     BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
         .addReg(ScratchReg)
         .addReg(CmpValReg)
-        .addMBB(DoneMBB);
+        .addMBB(LoopHeadBNETarget);
 
     // .looptail:
     //   xor scratch, dest, newval
Index: llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
===================================================================
--- llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
+++ llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
@@ -17,14 +17,12 @@
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lr.w.aqrl a3, (a0)
-; CHECK-NEXT:    bne a3, a1, .LBB0_5
+; CHECK-NEXT:    bne a3, a1, .LBB0_1
 ; CHECK-NEXT:  # %bb.4: # %do_cmpxchg
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=2
 ; CHECK-NEXT:    sc.w.aqrl a4, a2, (a0)
 ; CHECK-NEXT:    bnez a4, .LBB0_3
-; CHECK-NEXT:  .LBB0_5: # %do_cmpxchg
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    bne a3, a1, .LBB0_1
+; CHECK-NEXT:  # %bb.5: # %do_cmpxchg
 ; CHECK-NEXT:  # %bb.2: # %exit
 ; CHECK-NEXT:    ret
 entry:
@@ -86,7 +84,7 @@
 ; RV32IA-NEXT:    # => This Inner Loop Header: Depth=2
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a3)
 ; RV32IA-NEXT:    and a5, a4, a0
-; RV32IA-NEXT:    bne a5, a1, .LBB2_5
+; RV32IA-NEXT:    bne a5, a1, .LBB2_1
 ; RV32IA-NEXT:  # %bb.4: # %do_cmpxchg
 ; RV32IA-NEXT:    # in Loop: Header=BB2_3 Depth=2
 ; RV32IA-NEXT:    xor a5, a4, a2
@@ -94,10 +92,7 @@
 ; RV32IA-NEXT:    xor a5, a4, a5
 ; RV32IA-NEXT:    sc.w.aqrl a5, a5, (a3)
 ; RV32IA-NEXT:    bnez a5, .LBB2_3
-; RV32IA-NEXT:  .LBB2_5: # %do_cmpxchg
-; RV32IA-NEXT:    # in Loop: Header=BB2_1 Depth=1
-; RV32IA-NEXT:    and a4, a4, a0
-; RV32IA-NEXT:    bne a1, a4, .LBB2_1
+; RV32IA-NEXT:  # %bb.5: # %do_cmpxchg
 ; RV32IA-NEXT:  # %bb.2: # %exit
 ; RV32IA-NEXT:    ret
 ;
@@ -119,7 +114,7 @@
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
 ; RV64IA-NEXT:    lr.w.aqrl a4, (a3)
 ; RV64IA-NEXT:    and a5, a4, a0
-; RV64IA-NEXT:    bne a5, a1, .LBB2_5
+; RV64IA-NEXT:    bne a5, a1, .LBB2_1
 ; RV64IA-NEXT:  # %bb.4: # %do_cmpxchg
 ; RV64IA-NEXT:    # in Loop: Header=BB2_3 Depth=2
 ; RV64IA-NEXT:    xor a5, a4, a2
@@ -127,10 +122,7 @@
 ; RV64IA-NEXT:    xor a5, a4, a5
 ; RV64IA-NEXT:    sc.w.aqrl a5, a5, (a3)
 ; RV64IA-NEXT:    bnez a5, .LBB2_3
-; RV64IA-NEXT:  .LBB2_5: # %do_cmpxchg
-; RV64IA-NEXT:    # in Loop: Header=BB2_1 Depth=1
-; RV64IA-NEXT:    and a4, a4, a0
-; RV64IA-NEXT:    bne a1, a4, .LBB2_1
+; RV64IA-NEXT:  # %bb.5: # %do_cmpxchg
 ; RV64IA-NEXT:  # %bb.2: # %exit
 ; RV64IA-NEXT:    ret
 entry: