Index: include/llvm/IR/IntrinsicsR600.td
===================================================================
--- include/llvm/IR/IntrinsicsR600.td
+++ include/llvm/IR/IntrinsicsR600.td
@@ -38,8 +38,12 @@
 let TargetPrefix = "AMDGPU" in {
 def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">,
+  // 1st parameter: Numerator
+  // 2nd parameter: Denominator
+  // 3rd parameter: Constant to select which operand to scale
+  //                (true = numerator, false = denominator).
   Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty],
-            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
             [IntrNoMem]>;
 
 def int_AMDGPU_div_fmas : GCCBuiltin<"__builtin_amdgpu_div_fmas">,
Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -85,6 +85,8 @@
   bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
 
+  SDNode *SelectDIV_SCALE(SDNode *N);
+
   // Include the pieces autogenerated from the target description.
 #include "AMDGPUGenDAGISel.inc"
 };
@@ -489,6 +491,9 @@
                                   PackedOffsetWidth);
   }
+  case AMDGPUISD::DIV_SCALE: {
+    return SelectDIV_SCALE(N);
+  }
   }
   return SelectCode(N);
 }
@@ -682,6 +687,31 @@
   return true;
 }
 
+SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  assert(VT == MVT::f32 || VT == MVT::f64);
+
+  unsigned Opc
+    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+
+  const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+
+  SDValue Ops[] = {
+    N->getOperand(0),
+    N->getOperand(1),
+    N->getOperand(2),
+    Zero,
+    Zero,
+    Zero,
+    Zero
+  };
+
+  return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+}
+
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
Index: lib/Target/R600/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.cpp
+++ lib/Target/R600/AMDGPUISelLowering.cpp
@@ -849,9 +849,21 @@
       return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
 
-    case Intrinsic::AMDGPU_div_scale:
+    case Intrinsic::AMDGPU_div_scale: {
+      // 3rd parameter required to be a constant.
+      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+      if (!Param)
+        return DAG.getUNDEF(VT);
+
+      // Translate to the operands expected by the machine instruction: src0
+      // must match src1 or src2, and selects which of the two inputs is scaled.
+      SDValue Numerator = Op.getOperand(1);
+      SDValue Denominator = Op.getOperand(2);
+      SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
       return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2));
+                         Src0, Denominator, Numerator);
+    }
 
     case Intrinsic::AMDGPU_div_fmas:
       return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
Index: lib/Target/R600/SIInstrInfo.cpp
===================================================================
--- lib/Target/R600/SIInstrInfo.cpp
+++ lib/Target/R600/SIInstrInfo.cpp
@@ -524,6 +524,23 @@
   return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
 }
 
+static bool compareMachineOp(const MachineOperand &Op0,
+                             const MachineOperand &Op1) {
+  if (Op0.getType() != Op1.getType())
+    return false;
+
+  switch (Op0.getType()) {
+  case MachineOperand::MO_Register:
+    return Op0.getReg() == Op1.getReg();
+  case MachineOperand::MO_Immediate:
+    return Op0.getImm() == Op1.getImm();
+  case MachineOperand::MO_FPImmediate:
+    return Op0.getFPImm() == Op1.getFPImm();
+  default:
+    llvm_unreachable("Didn't expect to be comparing these operand types");
+  }
+}
+
 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                     StringRef &ErrInfo) const {
   uint16_t Opcode = MI->getOpcode();
@@ -630,6 +647,22 @@
       return false;
     }
   }
+
+  // Verify misc. restrictions on specific instructions.
+  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+    const MachineOperand &Src0 = MI->getOperand(2);
+    const MachineOperand &Src1 = MI->getOperand(3);
+    const MachineOperand &Src2 = MI->getOperand(4);
+    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+      if (!compareMachineOp(Src0, Src1) &&
+          !compareMachineOp(Src0, Src2)) {
+        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+        return false;
+      }
+    }
+  }
+
   return true;
 }
Index: lib/Target/R600/SIInstrInfo.td
===================================================================
--- lib/Target/R600/SIInstrInfo.td
+++ lib/Target/R600/SIInstrInfo.td
@@ -433,6 +433,22 @@
   opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
 >, VOP <opName>;
 
+class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
+                    string opName, list<dag> pattern> : VOP3 <
+  op, (outs vrc:$dst0, SReg_64:$dst1),
+  (ins arc:$src0, arc:$src1, arc:$src2,
+   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
+  opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
+>, VOP <opName>;
+
+
+class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
+  VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+
+class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
+  VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+
 //===----------------------------------------------------------------------===//
 // Vector I/O classes
 //===----------------------------------------------------------------------===//
Index: lib/Target/R600/SIInstructions.td
===================================================================
--- lib/Target/R600/SIInstructions.td
+++ lib/Target/R600/SIInstructions.td
@@ -1459,8 +1459,10 @@
 } // isCommutable = 1
 
-defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
-def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
 
 defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
   [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
Index: test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
===================================================================
--- test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
+++ test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
@@ -1,41 +1,48 @@
-; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-declare float @llvm.AMDGPU.div.scale.f32(float, float, float) nounwind readnone
-declare double @llvm.AMDGPU.div.scale.f64(double, double, double) nounwind readnone
+declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
+declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
 
-; SI-LABEL @test_div_scale_f32_1:
+; SI-LABEL: @test_div_scale_f32_1:
+; SI: V_DIV_SCALE_F32
 define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
   %a = load float addrspace(1)* %aptr, align 4
   %b = load float addrspace(1)* %bptr, align 4
-  %result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b, float %a) nounwind readnone
-  store float %result, float addrspace(1)* %out, align 4
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL @test_div_scale_f32_2:
+; SI-LABEL: @test_div_scale_f32_2:
+; SI: V_DIV_SCALE_F32
 define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
   %a = load float addrspace(1)* %aptr, align 4
   %b = load float addrspace(1)* %bptr, align 4
-  %result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b, float %b) nounwind readnone
-  store float %result, float addrspace(1)* %out, align 4
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL @test_div_scale_f64_1:
+; SI-LABEL: @test_div_scale_f64_1:
+; SI: V_DIV_SCALE_F64
 define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
   %a = load double addrspace(1)* %aptr, align 8
   %b = load double addrspace(1)* %bptr, align 8
-  %result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b, double %a) nounwind readnone
-  store double %result, double addrspace(1)* %out, align 8
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL @test_div_scale_f64_1:
+; SI-LABEL: @test_div_scale_f64_2:
+; SI: V_DIV_SCALE_F64
 define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
   %a = load double addrspace(1)* %aptr, align 8
   %b = load double addrspace(1)* %bptr, align 8
-  %result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b, double %b) nounwind readnone
-  store double %result, double addrspace(1)* %out, align 8
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
   ret void
 }
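
Note (not part of the patch): the tests above only verify that a V_DIV_SCALE instruction is selected. As a minimal IR usage sketch of the updated intrinsic signature, using the same declaration as the test file; the function @use_div_scale and its parameter names are illustrative only:

  declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone

  define void @use_div_scale(float addrspace(1)* %out, i1 addrspace(1)* %flag_out, float %num, float %den) nounwind {
    ; i1 true selects the numerator as the operand to scale.
    %r = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %num, float %den, i1 true) nounwind readnone
    ; First field: the (possibly) scaled value; second field: the scale flag,
    ; which maps to the SReg_64 $dst1 output of the VOP3b encoding added above.
    %scaled = extractvalue { float, i1 } %r, 0
    %flag = extractvalue { float, i1 } %r, 1
    store float %scaled, float addrspace(1)* %out, align 4
    store i1 %flag, i1 addrspace(1)* %flag_out
    ret void
  }

In the full hardware division expansion the flag result is intended to end up in VCC, where the later v_div_fmas step consumes it.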