Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -87,6 +87,7 @@
   bool selectG_TRUNC(MachineInstr &I) const;
   bool selectG_SZA_EXT(MachineInstr &I) const;
   bool selectG_CONSTANT(MachineInstr &I) const;
+  bool selectG_FNEG(MachineInstr &I) const;
   bool selectG_AND_OR_XOR(MachineInstr &I) const;
   bool selectG_ADD_SUB(MachineInstr &I) const;
   bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1664,6 +1664,61 @@
   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
 }
 
+bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
+  // Only manually handle the f64 SGPR case.
+  //
+  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
+  // the bit ops theoretically have a second result due to the implicit def of
+  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
+  // that is easy by disabling the check. The result works, but uses a
+  // nonsensical sreg32orlds_and_sreg_1 regclass.
+  //
+  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
+  // the variadic REG_SEQUENCE operands.
+
+  Register Dst = MI.getOperand(0).getReg();
+  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
+      MRI->getType(Dst) != LLT::scalar(64))
+    return false;
+
+  Register Src = MI.getOperand(1).getReg();
+  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
+  if (Fabs)
+    Src = Fabs->getOperand(1).getReg();
+
+  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
+      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
+    return false;
+
+  MachineBasicBlock *BB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
+    .addReg(Src, 0, AMDGPU::sub0);
+  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
+    .addReg(Src, 0, AMDGPU::sub1);
+  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
+    .addImm(0x80000000);
+
+  // Set or toggle sign bit.
+  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
+  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
+    .addReg(HiReg)
+    .addReg(ConstReg);
+  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+    .addReg(LoReg)
+    .addImm(AMDGPU::sub0)
+    .addReg(OpReg)
+    .addImm(AMDGPU::sub1);
+  MI.eraseFromParent();
+  return true;
+}
+
 static bool isConstant(const MachineInstr &MI) {
   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
 }
@@ -2075,6 +2130,10 @@
   case TargetOpcode::G_CONSTANT:
   case TargetOpcode::G_FCONSTANT:
     return selectG_CONSTANT(I);
+  case TargetOpcode::G_FNEG:
+    if (selectImpl(I, *CoverageInfo))
+      return true;
+    return selectG_FNEG(I);
   case TargetOpcode::G_EXTRACT:
     return selectG_EXTRACT(I);
   case TargetOpcode::G_MERGE_VALUES:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir
@@ -200,9 +200,13 @@
     liveins: $sgpr0_sgpr1
     ; GCN-LABEL: name: fneg_s64_ss
    ; GCN: liveins: $sgpr0_sgpr1
-    ; GCN: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
-    ; GCN: [[FNEG:%[0-9]+]]:sreg_64(s64) = G_FNEG [[COPY]]
-    ; GCN: $sgpr0_sgpr1 = COPY [[FNEG]](s64)
+    ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
+    ; GCN: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_XOR_B32_]], %subreg.sub1
+    ; GCN: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_FNEG %0
     $sgpr0_sgpr1 = COPY %1
@@ -462,10 +466,13 @@
     liveins: $sgpr0_sgpr1
     ; GCN-LABEL: name: fneg_fabs_s64_ss
    ; GCN: liveins: $sgpr0_sgpr1
-    ; GCN: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
-    ; GCN: [[FABS:%[0-9]+]]:sgpr(s64) = G_FABS [[COPY]]
-    ; GCN: [[FNEG:%[0-9]+]]:sreg_64(s64) = G_FNEG [[FABS]]
-    ; GCN: $sgpr0_sgpr1 = COPY [[FNEG]](s64)
+    ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
+    ; GCN: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_OR_B32_]], %subreg.sub1
+    ; GCN: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_FABS %0
     %2:sgpr(s64) = G_FNEG %1