Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -65,8 +65,8 @@
   SOPK_ZEXT = UINT64_C(1) << 38,
   SCALAR_STORE = UINT64_C(1) << 39,
   FIXED_SIZE = UINT64_C(1) << 40,
-  VOPAsmPrefer32Bit = UINT64_C(1) << 41
-
+  VOPAsmPrefer32Bit = UINT64_C(1) << 41,
+  HasFPClamp = UINT64_C(1) << 42
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -66,6 +66,7 @@
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
+  const SISubtarget *ST;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
@@ -75,6 +76,10 @@
 
   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
+  const MachineOperand *isClamp(const SIInstrInfo &TII,
+                                const MachineInstr &MI) const;
+  bool tryFoldClamp(MachineInstr &MI);
+
 public:
   SIFoldOperands() : MachineFunctionPass(ID) {
     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -686,14 +691,77 @@
   }
 }
 
+const MachineOperand *SIFoldOperands::isClamp(const SIInstrInfo &TII,
+                                              const MachineInstr &MI) const {
+  unsigned Op = MI.getOpcode();
+  switch (Op) {
+  case AMDGPU::V_MAX_F32_e64:
+  case AMDGPU::V_MAX_F16_e64: {
+    // XXX - Do denormals work with clamp? At least for f16?
+    if ((ST->hasFP32Denormals() && Op == AMDGPU::V_MAX_F32_e64) ||
+        (ST->hasFP16Denormals() && Op == AMDGPU::V_MAX_F16_e64))
+      return nullptr;
+
+    if (!TII.getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+      return nullptr;
+
+    // Make sure sources are identical.
+    const MachineOperand *Src0 = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
+    if (!Src0->isReg() || Src0->getSubReg() != Src1->getSubReg() ||
+        Src0->getSubReg() != AMDGPU::NoSubRegister)
+      return nullptr;
+
+    // Can't fold up if we have source modifiers.
+    if (TII.hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        TII.hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))
+      return nullptr;
+    return Src0;
+  }
+  default:
+    return nullptr;
+  }
+}
+
+// We obviously have multiple uses in a clamp since the register is used twice
+// in the same instruction.
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+  int Count = 0;
+  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+       I != E; ++I) {
+    if (++Count > 1)
+      return false;
+  }
+
+  return true;
+}
+
+// FIXME: Does this need to check IEEE bit on function?
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+  const MachineOperand *ClampSrc = isClamp(*TII, MI);
+  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+    return false;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(ClampSrc->getReg());
+  if (!TII->hasFPClamp(*Def))
+    return false;
+  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+  if (!DefClamp)
+    return false;
+
+  DefClamp->setImm(1);
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
 
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
   MRI = &MF.getRegInfo();
-  TII = ST.getInstrInfo();
+  ST = &MF.getSubtarget<SISubtarget>();
+  TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -705,8 +773,11 @@
       Next = std::next(I);
       MachineInstr &MI = *I;
 
-      if (!isSafeToFold(MI))
+      if (!isSafeToFold(MI)) {
+        // TODO: Try omod also.
+        tryFoldClamp(MI);
         continue;
+      }
 
       MachineOperand &OpToFold = MI.getOperand(1);
       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -78,6 +78,10 @@
   // is unable to infer the encoding from the operands.
   field bit VOPAsmPrefer32Bit = 0;
 
+  // This bit indicates that this has a floating point result type, so
+  // the clamp modifier has floating point semantics.
+  field bit FPClamp = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -120,6 +124,7 @@
   let TSFlags{39} = ScalarStore;
   let TSFlags{40} = FixedSize;
   let TSFlags{41} = VOPAsmPrefer32Bit;
+  let TSFlags{42} = FPClamp;
 
   let SchedRW = [Write32Bit];
 
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -474,6 +474,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
   }
 
+  static bool hasFPClamp(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp;
+  }
+
+  bool hasFPClamp(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -1086,6 +1086,7 @@
   field bit HasOMod = HasModifiers;
   field bit HasClamp = HasModifiers;
   field bit HasSDWAClamp = HasSrc0;
+  field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
 
   field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
 
Index: lib/Target/AMDGPU/VOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOPInstructions.td
+++ lib/Target/AMDGPU/VOPInstructions.td
@@ -100,6 +100,7 @@
 
   let VOP3 = 1;
   let VALU = 1;
+  let FPClamp = P.HasFPClamp;
   let Uses = [EXEC];
 
   let AsmVariantName = AMDGPUAsmVariants.VOP3;
Index: test/CodeGen/AMDGPU/clamp.ll
===================================================================
--- test/CodeGen/AMDGPU/clamp.ll
+++ test/CodeGen/AMDGPU/clamp.ll
@@ -87,9 +87,8 @@
 ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
 ; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
 
-; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
-; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
-; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
 define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
@@ -107,9 +106,8 @@
 ; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
 
 ; FIXME: Better to fold neg into max
-; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
-; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
-; SI: v_cvt_f16_f32
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
 define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
@@ -129,9 +127,8 @@
 
 ; FIXME: Better to fold neg/abs into max
-; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]|
-; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
-; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
 define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
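
For context, a minimal LLVM IR sketch of the kind of input this fold targets: a clamp written as maxnum/minnum against 0.0 and 1.0 is selected to a v_max with identical sources and the clamp bit set, and tryFoldClamp then moves that clamp bit onto the defining VALU instruction. The kernel name and the expected "v_add_f32_e64 ... clamp" outcome below are illustrative assumptions, not checks taken from this patch.

; Illustrative only: with the fold enabled, the clamp of the add result is
; assumed to end up on the add itself (v_add_f32_e64 ... clamp), with no
; separate v_max_f32_e64 ... clamp instruction emitted.
define amdgpu_kernel void @v_clamp_add_example(float addrspace(1)* %out, float addrspace(1)* %aptr) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamped, float addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)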