Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -135,6 +135,8 @@
   void SelectADD_SUB_I64(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
+  void SelectFMA_W_CHAIN(SDNode *N);
+  void SelectFMUL_W_CHAIN(SDNode *N);
 
   SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                    uint32_t Offset, uint32_t Width);
@@ -296,6 +298,15 @@
     SelectADD_SUB_I64(N);
     return;
   }
+  case AMDGPUISD::FMUL_W_CHAIN: {
+    SelectFMUL_W_CHAIN(N);
+    return;
+  }
+  case AMDGPUISD::FMA_W_CHAIN: {
+    SelectFMA_W_CHAIN(N);
+    return;
+  }
+
   case ISD::SCALAR_TO_VECTOR:
   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
   case ISD::BUILD_VECTOR: {
@@ -653,6 +664,34 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
+  SDLoc SL(N);
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2,
+  // clamp, omod, chain, glue
+  SDValue Ops[10];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+  Ops[8] = N->getOperand(0);
+  Ops[9] = N->getOperand(4);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+  SDLoc SL(N);
+  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod, chain, glue
+  SDValue Ops[8];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  Ops[6] = N->getOperand(0);
+  Ops[7] = N->getOperand(3);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+}
+
 // We need to handle this here because tablegen doesn't support matching
 // instructions with multiple outputs.
 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -230,6 +230,10 @@
   // This is SETCC with the full mask result which is used for a compare with a
   // result bit per item in the wavefront.
   SETCC,
+  SETREG,
+  // FP ops with input and output chain.
+  FMA_W_CHAIN,
+  FMUL_W_CHAIN,
 
   // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
   // Denormals handled on some parts.
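Note: SelectFMA_W_CHAIN fills Ops[] in the VOP3 operand order, with the node's
incoming chain and glue appended after the machine operands. A minimal sketch of
the index mapping (enumerator names are hypothetical, for illustration only):

    // Hypothetical labels for the Ops[] slots filled by SelectFMA_W_CHAIN.
    enum FmaWChainOpIdx {
      Src0Mods = 0, Src0 = 1, // from SelectVOP3Mods0(N->getOperand(1), ...)
      Src1Mods = 2, Src1 = 3, // from SelectVOP3Mods(N->getOperand(2), ...)
      Src2Mods = 4, Src2 = 5, // from SelectVOP3Mods(N->getOperand(3), ...)
      Clamp = 6, Omod = 7,    // also produced by SelectVOP3Mods0
      Chain = 8, Glue = 9     // N->getOperand(0) and N->getOperand(4)
    };

SelectFMUL_W_CHAIN uses the same layout minus the src2 pair.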
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2953,6 +2953,9 @@
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(SETCC)
+  NODE_NAME_CASE(SETREG)
+  NODE_NAME_CASE(FMA_W_CHAIN)
+  NODE_NAME_CASE(FMUL_W_CHAIN)
   NODE_NAME_CASE(CLAMP)
   NODE_NAME_CASE(COS_HW)
   NODE_NAME_CASE(SIN_HW)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -150,6 +150,19 @@
 
 def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
 
+def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
+  SDTCisInt<0>, SDTCisInt<1>
+]>;
+
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+  SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
 def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
   SDTIntToFPOp, []>;
 
 def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,14 @@
     MI->setDesc(TII->get(Opc));
   }
 
+  // Special case for s_setreg_b32: an immediate source can always be folded
+  // by switching to the dedicated immediate form.
+  if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
+    MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
+    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+    return true;
+  }
+
   // If we are already folding into another operand of MI, then
   // we can't commute the instruction, otherwise we risk making the
   // other fold illegal.
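The bitfield passed to s_setreg (printed as hwreg(HW_REG_MODE, 4, 2) in the
tests below) packs the hardware register id, bit offset, and width-minus-one
into the instruction's 16-bit simm16 operand. A standalone sketch of the
encoding, assuming the usual SIDefines.h layout (id in bits [5:0], offset
shifted by 6, width-1 shifted by 11, ID_MODE = 1):

    #include <cstdio>

    // simm16 = id | (offset << 6) | ((width - 1) << 11), per the assumed shifts.
    static unsigned encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
      return Id | (Offset << 6) | ((Width - 1) << 11);
    }

    int main() {
      // hwreg(HW_REG_MODE, 4, 2) selects the two FP32 denormal bits of MODE.
      std::printf("0x%x\n", encodeHwreg(1, 4, 2)); // prints 0x901
      // Values written: 3 = FP_DENORM_FLUSH_NONE, 0 = FP_DENORM_FLUSH_IN_FLUSH_OUT.
      return 0;
    }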
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
@@ -2858,6 +2859,47 @@
   return SDValue();
 }
 
+static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+  if (GlueChain->getNumValues() <= 1) {
+    return DAG.getNode(Opcode, SL, VT, A, B);
+  }
+
+  assert(GlueChain->getNumValues() == 3);
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  switch (Opcode) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FMUL:
+    Opcode = AMDGPUISD::FMUL_W_CHAIN;
+    break;
+  }
+
+  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
+                     GlueChain.getValue(2));
+}
+
+static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                           EVT VT, SDValue A, SDValue B, SDValue C,
+                           SDValue GlueChain) {
+  if (GlueChain->getNumValues() <= 1) {
+    return DAG.getNode(Opcode, SL, VT, A, B, C);
+  }
+
+  assert(GlueChain->getNumValues() == 3);
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  switch (Opcode) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FMA:
+    Opcode = AMDGPUISD::FMA_W_CHAIN;
+    break;
+  }
+
+  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
+                     GlueChain.getValue(2));
+}
+
 // Faster 2.5 ULP division that does not support denormals.
 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
@@ -2904,25 +2946,79 @@
 
   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
 
-  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
-  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
+  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+                                          RHS, RHS, LHS);
+  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+                                        LHS, RHS, LHS);
 
   // Denominator is scaled to not be denormal, so using rcp is ok.
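+  // The reciprocal is only a seed: two Newton-Raphson style fma steps
+  // refine it (e = fma(-d, r, 1), r1 = fma(e, r, r)), a further pair refines
+  // the quotient (q = n * r1, err = fma(-d, q, n), q1 = fma(err, r1, q)),
+  // and div_fmas/div_fixup consume the final error term. The setreg pair
+  // below keeps denormals enabled across the fma chain on fp32-flush parts.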
-  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+                                  DenominatorScaled);
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
+                                     DenominatorScaled);
+
+  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
+                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+
+  if (!Subtarget->hasFP32Denormals()) {
+    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+                                                      SL, MVT::i32);
+    SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+                                       DAG.getEntryNode(),
+                                       EnableDenormValue, BitField);
+    SDValue Ops[3] = {
+      NegDivScale0,
+      EnableDenorm.getValue(0),
+      EnableDenorm.getValue(1)
+    };
+
+    NegDivScale0 = DAG.getMergeValues(Ops, SL);
+  }
+
+  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
+                             ApproxRcp, One, NegDivScale0);
+
+  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+                             ApproxRcp, Fma0);
 
-  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
+  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
+                           Fma1, Fma1);
 
-  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
-  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
+  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+                             NumeratorScaled, Mul);
 
-  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
+  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul,
+                             Fma2);
 
-  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
-  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
-  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
+  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+                             NumeratorScaled, Fma3);
+
+  if (!Subtarget->hasFP32Denormals()) {
+    const SDValue DisableDenormValue =
+        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+                                        Fma4.getValue(1),
+                                        DisableDenormValue,
+                                        BitField,
+                                        Fma4.getValue(2));
+
+    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                                      DisableDenorm, DAG.getRoot());
+    DAG.setRoot(OutputChain);
+  }
 
   SDValue Scale = NumeratorScaled.getValue(1);
-  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
+                             Fma4, Fma1, Fma3, Scale);
 
   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
 }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1669,6 +1669,8 @@
   // boundaries prevents incorrect movements of such instructions.
   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
     MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+    MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+    MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
     changesVGPRIndexingMode(MI);
 }
Index: lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/SOPInstructions.td
+++ lib/Target/AMDGPU/SOPInstructions.td
@@ -590,10 +590,13 @@
 >;
 }
 
+let hasSideEffects = 1 in {
+
 def S_SETREG_B32 : SOPK_Pseudo <
   "s_setreg_b32",
   (outs), (ins SReg_32:$sdst, hwreg:$simm16),
-  "$simm16, $sdst"
+  "$simm16, $sdst",
+  [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
 >;
 
 // FIXME: Not on SI?
@@ -607,6 +610,7 @@
   let has_sdst = 0;
 }
 
+} // End hasSideEffects = 1
 
 //===----------------------------------------------------------------------===//
 // SOPC Instructions
Index: test/CodeGen/AMDGPU/fdiv.ll
===================================================================
--- test/CodeGen/AMDGPU/fdiv.ll
+++ test/CodeGen/AMDGPU/fdiv.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
@@ -11,18 +13,20 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; SI: v_div_scale_f32
-; SI-DAG: v_div_scale_f32
-
-; SI-DAG: v_rcp_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_mul_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_div_fmas_f32
-; SI: v_div_fixup_f32
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
 define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b
@@ -30,12 +34,37 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}fdiv_f32_denormals:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; GCN-NOT: s_setreg
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN-NOT: s_setreg
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+entry:
+  %fdiv = fdiv float %a, %b
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
-; SI: v_cndmask_b32
-; SI: v_mul_f32
-; SI: v_rcp_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
+; GCN: v_cndmask_b32
+; GCN: v_mul_f32
+; GCN: v_rcp_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
 define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
@@ -45,9 +74,9 @@
 
 ; Use correct fdiv
 ; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
-; SI: v_fma_f32
-; SI: v_div_fmas_f32
-; SI: v_div_fixup_f32
+; GCN: v_fma_f32
+; GCN: v_div_fmas_f32
+; GCN: v_div_fixup_f32
 define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
@@ -56,10 +85,10 @@
 }
 
 ; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv fast float %a, %b
@@ -71,10 +100,10 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv fast float %a, %b
@@ -86,10 +115,10 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv arcp float %a, %b
@@ -103,10 +132,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv <2 x float> %a, %b
@@ -115,8 +144,8 @@
 }
 
 ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
-; SI: v_cmp_gt_f32
-; SI: v_cmp_gt_f32
+; GCN: v_cmp_gt_f32
+; GCN: v_cmp_gt_f32
 define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
@@ -130,8 +159,8 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv fast <2 x float> %a, %b
@@ -145,8 +174,8 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b
@@ -164,10 +193,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -187,10 +216,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -210,10 +239,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
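
For reference, a host-side model of the refinement sequence the GCN checks
above expect (illustrative only: it uses an exact reciprocal in place of
v_rcp_f32's estimate and omits the div_scale/div_fmas/div_fixup handling of
exponent scaling and special cases):

    #include <cmath>
    #include <cstdio>

    // Mirrors the Fma0..Fma4 chain from the f32 fdiv lowering above.
    static float fdivModel(float N, float D) {
      float R  = 1.0f / D;               // stand-in for v_rcp_f32's estimate
      float E  = std::fma(-D, R, 1.0f);  // Fma0: reciprocal error
      float R1 = std::fma(E, R, R);      // Fma1: refined reciprocal
      float Q  = N * R1;                 // Mul:  initial quotient
      float E1 = std::fma(-D, Q, N);     // Fma2: quotient error
      float Q1 = std::fma(E1, R1, Q);    // Fma3: refined quotient
      float E2 = std::fma(-D, Q1, N);    // Fma4: remaining error
      return std::fma(E2, R1, Q1);       // final step done by v_div_fmas_f32
    }

    int main() {
      std::printf("%.9g\n", fdivModel(355.0f, 113.0f)); // ~3.14159292
      return 0;
    }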