Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -697,7 +697,8 @@
   unsigned Op = MI.getOpcode();
   switch (Op) {
   case AMDGPU::V_MAX_F32_e64:
-  case AMDGPU::V_MAX_F16_e64: {
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F64: {
     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
       return nullptr;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4054,7 +4054,8 @@
   }
 
   // No med3 for f16, but clamp is possible.
-  if (VT == MVT::f16)
+  // TODO: gfx9 has med3 f16
+  if (VT == MVT::f16 || VT == MVT::f64)
     return SDValue();
 
   // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
@@ -4072,6 +4073,7 @@
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
 
+  EVT VT = N->getValueType(0);
   unsigned Opc = N->getOpcode();
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
@@ -4079,7 +4081,9 @@
 
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
-  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+
+  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
+      VT != MVT::f64) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
@@ -4121,8 +4125,8 @@
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
-      (N->getValueType(0) == MVT::f32 ||
-       (N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) &&
+      (VT == MVT::f32 || VT == MVT::f64 ||
+       (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
       Op0.hasOneUse()) {
     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
       return Res;
@@ -4403,7 +4407,6 @@
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY: {
     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
-        N->getValueType(0) != MVT::f64 &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMinMaxCombine(N, DCI);
     break;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -657,8 +657,8 @@
         i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
 >;
 
-// TODO: Does f64 support clamp?
 def : ClampPat<V_MAX_F32_e64, f32>;
+def : ClampPat<V_MAX_F64, f64>;
 def : ClampPat<V_MAX_F16_e64, f16>;
 
 /********** ================================ **********/
Index: test/CodeGen/AMDGPU/clamp-modifier.ll
===================================================================
--- test/CodeGen/AMDGPU/clamp-modifier.ll
+++ test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -153,6 +153,21 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}v_clamp_add_src_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %add = fadd double %a, 1.0
+  %max = call double @llvm.maxnum.f64(double %add, double 0.0)
+  %clamp = call double @llvm.minnum.f64(double %max, double 1.0)
+  store double %clamp, double addrspace(1)* %out.gep
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @llvm.fabs.f32(float) #1
 declare float @llvm.floor.f32(float) #1
Index: test/CodeGen/AMDGPU/clamp.ll
===================================================================
--- test/CodeGen/AMDGPU/clamp.ll
+++ test/CodeGen/AMDGPU/clamp.ll
@@ -147,8 +147,7 @@
 ; FIXME: Do f64 instructions support clamp?
 ; GCN-LABEL: {{^}}v_clamp_f64:
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
 define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
@@ -163,8 +162,7 @@
 
 ; GCN-LABEL: {{^}}v_clamp_neg_f64:
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
 define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
@@ -180,8 +178,7 @@
 
 ; GCN-LABEL: {{^}}v_clamp_negabs_f64:
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
 define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid