[x86][eflags] Fix PR37431 by teaching the EFLAGS copy lowering to

chandlerc · chandlerc · commit 5ecd81aab0dd · 2018-05-15T20:16:57.000Z
specially handle SETB_C* pseudo instructions. Summary: While the logic here is somewhat similar to the arithmetic lowering, it is different enough that it made sense to have its own function. I actually tried a bunch of different optimizations here and none worked well so I gave up and just always do the arithmetic based lowering. Looking at code from the PR test case, we actually pessimize a bunch of code when generating these. Because SETB_C* pseudo instructions clobber EFLAGS, we end up creating a bunch of copies of EFLAGS to feed multiple SETB_C* pseudos from a single set of EFLAGS. This in turn causes the lowering code to ruin all the clever code generation that SETB_C* was hoping to achieve. None of this is needed. Whenever we're generating multiple SETB_C* instructions from a single set of EFLAGS we should instead generate a single maximally wide one and extract subregs for all the different desired widths. That would result in substantially better code generation. But this patch doesn't attempt to address that. The test case from the PR is included as well as more directed testing of the specific lowering pattern used for these pseudos. Reviewers: craig.topper Subscribers: sanjoy, mcrosier, llvm-commits, hiraditya Differential Revision: https://reviews.llvm.org/D46799 llvm-svn: 332389
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -127,6 +127,10 @@ class X86FlagsCopyLoweringPass : public MachineFunctionPass {
                       MachineInstr &JmpI, CondRegArray &CondRegs);
   void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
                    MachineInstr &CopyDefI);
+  void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
+                               MachineBasicBlock::iterator TestPos,
+                               DebugLoc TestLoc, MachineInstr &SetBI,
+                               MachineOperand &FlagUse, CondRegArray &CondRegs);
   void rewriteSetCC(MachineBasicBlock &TestMBB,
                     MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                     MachineInstr &SetCCI, MachineOperand &FlagUse,
@@ -512,8 +516,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
         } else if (MI.getOpcode() == TargetOpcode::COPY) {
           rewriteCopy(MI, *FlagUse, CopyDefI);
         } else {
-          // We assume that arithmetic instructions that use flags also def
-          // them.
+          // We assume all other instructions that use flags also def them.
           assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
                  "Expected a def of EFLAGS for this instruction!");
 
@@ -525,7 +528,23 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
           // logic.
           FlagsKilled = true;
 
-          rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+          switch (MI.getOpcode()) {
+          case X86::SETB_C8r:
+          case X86::SETB_C16r:
+          case X86::SETB_C32r:
+          case X86::SETB_C64r:
+            // Use custom lowering for arithmetic that is merely extending the
+            // carry flag. We model this as the SETB_C* pseudo instructions.
+            rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+                                    CondRegs);
+            break;
+
+          default:
+            // Generically handle remaining uses as arithmetic instructions.
+            rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+                              CondRegs);
+            break;
+          }
           break;
         }
 
@@ -753,6 +772,126 @@ void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
   MI.eraseFromParent();
 }
 
+void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
+    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+    DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
+    CondRegArray &CondRegs) {
+  // This routine is only used to handle pseudos for setting a register to zero
+  // or all ones based on CF. This is essentially the sign extended from 1-bit
+  // form of SETB and modeled with the SETB_C* pseudos. They require special
+  // handling as they aren't normal SETcc instructions and are lowered to an
+  // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
+  // they are only provided in reg-defining forms. A complicating factor is that
+  // they can define many different register widths.
+  assert(SetBI.getOperand(0).isReg() &&
+         "Cannot have a non-register defined operand to this variant of SETB!");
+
+  // Little helper to do the common final step of replacing the register def'ed
+  // by this SETB instruction with a new register and removing the SETB
+  // instruction.
+  auto RewriteToReg = [&](unsigned Reg) {
+    MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
+    SetBI.eraseFromParent();
+  };
+
+  // Grab the register class used for this particular instruction.
+  auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());
+
+  MachineBasicBlock &MBB = *SetBI.getParent();
+  auto SetPos = SetBI.getIterator();
+  auto SetLoc = SetBI.getDebugLoc();
+
+  auto AdjustReg = [&](unsigned Reg) {
+    auto &OrigRC = *MRI->getRegClass(Reg);
+    if (&OrigRC == &SetBRC)
+      return Reg;
+
+    unsigned NewReg;
+
+    int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
+    int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
+    assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
+    assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
+    int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
+                       X86::NoSubRegister, X86::sub_32bit};
+
+    // If the original size is smaller than the target *and* is smaller than 4
+    // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
+    // to maximize the chance of being able to CSE that operation and to avoid
+    // partial dependency stalls extending to 2-bytes.
+    if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
+      NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+      BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
+          .addReg(Reg);
+      if (&SetBRC == &X86::GR32RegClass)
+        return NewReg;
+      Reg = NewReg;
+      OrigRegSize = 4;
+    }
+
+    NewReg = MRI->createVirtualRegister(&SetBRC);
+    if (OrigRegSize < TargetRegSize) {
+      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
+              NewReg)
+          .addImm(0)
+          .addReg(Reg)
+          .addImm(SubRegIdx[OrigRegSize]);
+    } else if (OrigRegSize > TargetRegSize) {
+      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG),
+              NewReg)
+          .addReg(Reg)
+          .addImm(SubRegIdx[TargetRegSize]);
+    } else {
+      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
+          .addReg(Reg);
+    }
+    return NewReg;
+  };
+
+  unsigned &CondReg = CondRegs[X86::COND_B];
+  if (!CondReg)
+    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
+
+  // Adjust the condition to have the desired register width by zero-extending
+  // as needed.
+  // FIXME: We should use a better API to avoid the local reference and using a
+  // different variable here.
+  unsigned ExtCondReg = AdjustReg(CondReg);
+
+  // Now we need to turn this into a bitmask. We do this by subtracting it from
+  // zero.
+  unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+  BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
+  ZeroReg = AdjustReg(ZeroReg);
+
+  unsigned Sub;
+  switch (SetBI.getOpcode()) {
+  case X86::SETB_C8r:
+    Sub = X86::SUB8rr;
+    break;
+
+  case X86::SETB_C16r:
+    Sub = X86::SUB16rr;
+    break;
+
+  case X86::SETB_C32r:
+    Sub = X86::SUB32rr;
+    break;
+
+  case X86::SETB_C64r:
+    Sub = X86::SUB64rr;
+    break;
+
+  default:
+    llvm_unreachable("Invalid SETB_C* opcode!");
+  }
+  unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
+  BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
+      .addReg(ZeroReg)
+      .addReg(ExtCondReg);
+  return RewriteToReg(ResultReg);
+}
+
 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
                                             MachineBasicBlock::iterator TestPos,
                                             DebugLoc TestLoc,
diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll
@@ -304,3 +304,62 @@ bb1:
   %tmp12 = trunc i32 %tmp11 to i16
   br label %bb1
 }
+
+; Use a particular instruction pattern in order to lower to the post-RA pseudo
+; used to lower SETB into an SBB pattern in order to make sure that kind of
+; usage of a copied EFLAGS continues to work.
+define void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3) {
+; X32-LABEL: PR37431:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_offset %esi, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    sarl $31, %ecx
+; X32-NEXT:    cmpl %eax, %eax
+; X32-NEXT:    sbbl %ecx, %eax
+; X32-NEXT:    setb %al
+; X32-NEXT:    sbbb %cl, %cl
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movb %cl, (%edx)
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    xorl %ecx, %ecx
+; X32-NEXT:    subl %eax, %ecx
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    idivl %ecx
+; X32-NEXT:    movb %dl, (%esi)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X64-LABEL: PR37431:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movslq (%rdi), %rax
+; X64-NEXT:    cmpq %rax, %rax
+; X64-NEXT:    sbbb %dl, %dl
+; X64-NEXT:    cmpq %rax, %rax
+; X64-NEXT:    movb %dl, (%rsi)
+; X64-NEXT:    sbbl %esi, %esi
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    idivl %esi
+; X64-NEXT:    movb %dl, (%rcx)
+; X64-NEXT:    retq
+entry:
+  %tmp = load i32, i32* %arg1
+  %tmp1 = sext i32 %tmp to i64
+  %tmp2 = icmp ugt i64 %tmp1, undef
+  %tmp3 = zext i1 %tmp2 to i8
+  %tmp4 = sub i8 0, %tmp3
+  store i8 %tmp4, i8* %arg2
+  %tmp5 = sext i8 %tmp4 to i32
+  %tmp6 = srem i32 0, %tmp5
+  %tmp7 = trunc i32 %tmp6 to i8
+  store i8 %tmp7, i8* %arg3
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/flags-copy-lowering.mir b/llvm/test/CodeGen/X86/flags-copy-lowering.mir
@@ -66,6 +66,12 @@
     call void @foo()
     ret void
   }
+
+  define void @test_setb_c(i64 %a, i64 %b) {
+  entry:
+    call void @foo()
+    ret void
+  }
 ...
 ---
 name:            test_branch
@@ -482,3 +488,68 @@ body:             |
     RET 0
 
 ...
+---
+name:            test_setb_c
+# CHECK-LABEL: name: test_setb_c
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    %2:gr64 = ADD64rr %0, %1, implicit-def $eflags
+    %3:gr64 = COPY $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+  ; CHECK:      %[[CF_REG:[^:]*]]:gr8 = SETBr implicit $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+    $eflags = COPY %3
+    %4:gr8 = SETB_C8r implicit-def $eflags, implicit $eflags
+    MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %4
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags
+  ; CHECK-NEXT:    %[[ZERO_SUBREG:[^:]*]]:gr8 = EXTRACT_SUBREG %[[ZERO]], %subreg.sub_8bit
+  ; CHECK-NEXT:    %[[REPLACEMENT:[^:]*]]:gr8 = SUB8rr %[[ZERO_SUBREG]], %[[CF_REG]]
+  ; CHECK-NEXT:    MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]]
+
+    $eflags = COPY %3
+    %5:gr16 = SETB_C16r implicit-def $eflags, implicit $eflags
+    MOV16mr $rsp, 1, $noreg, -16, $noreg, killed %5
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         %[[CF_EXT:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]]
+  ; CHECK-NEXT:    %[[CF_TRUNC:[^:]*]]:gr16 = EXTRACT_SUBREG %[[CF_EXT]], %subreg.sub_16bit
+  ; CHECK-NEXT:    %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags
+  ; CHECK-NEXT:    %[[ZERO_SUBREG:[^:]*]]:gr16 = EXTRACT_SUBREG %[[ZERO]], %subreg.sub_16bit
+  ; CHECK-NEXT:    %[[REPLACEMENT:[^:]*]]:gr16 = SUB16rr %[[ZERO_SUBREG]], %[[CF_TRUNC]]
+  ; CHECK-NEXT:    MOV16mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]]
+
+    $eflags = COPY %3
+    %6:gr32 = SETB_C32r implicit-def $eflags, implicit $eflags
+    MOV32mr $rsp, 1, $noreg, -16, $noreg, killed %6
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         %[[CF_EXT:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]]
+  ; CHECK-NEXT:    %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags
+  ; CHECK-NEXT:    %[[REPLACEMENT:[^:]*]]:gr32 = SUB32rr %[[ZERO]], %[[CF_EXT]]
+  ; CHECK-NEXT:    MOV32mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]]
+
+    $eflags = COPY %3
+    %7:gr64 = SETB_C64r implicit-def $eflags, implicit $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         %[[CF_EXT1:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]]
+  ; CHECK-NEXT:    %[[CF_EXT2:[^:]*]]:gr64 = SUBREG_TO_REG 0, %[[CF_EXT1]], %subreg.sub_32bit
+  ; CHECK-NEXT:    %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags
+  ; CHECK-NEXT:    %[[ZERO_EXT:[^:]*]]:gr64 = SUBREG_TO_REG 0, %[[ZERO]], %subreg.sub_32bit
+  ; CHECK-NEXT:    %[[REPLACEMENT:[^:]*]]:gr64 = SUB64rr %[[ZERO_EXT]], %[[CF_EXT2]]
+  ; CHECK-NEXT:    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]]
+
+    RET 0
+
+...