Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -1459,19 +1459,24 @@ multiclass VOP3_C_m pattern, string opName, - bit HasMods, bit defExec, string revOp> { + bit HasMods, bit defExec, + string revOp, list sched> { def "" : VOP3_Pseudo , - VOP2_REV; + VOP2_REV { + let SchedRW = sched; + } def _si : VOP3_Real_si , VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } def _vi : VOP3_Real_vi , VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } } @@ -1657,39 +1662,40 @@ multiclass VOPC_m pattern, string opName, bit DefExec, VOPProfile p, + list sched, string revOpName = "", string asm = opName#"_e32 "#op_asm, string alias_asm = opName#" "#op_asm> { - def "" : VOPC_Pseudo ; + def "" : VOPC_Pseudo { + let SchedRW = sched; + } let AssemblerPredicates = [isSICI] in { + def _si : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } - def _si : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; - } - - def : SIInstAlias < - alias_asm, - (!cast(NAME#"_e32_si") p.Src0RC32:$src0, p.Src1RC32:$src1) - >; + def : SIInstAlias < + alias_asm, + (!cast(NAME#"_e32_si") p.Src0RC32:$src0, p.Src1RC32:$src1) + >; } // End AssemblerPredicates = [isSICI] - let AssemblerPredicates = [isVI] in { + def _vi : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } - def _vi : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; - } - - def : SIInstAlias < - alias_asm, - (!cast(NAME#"_e32_vi") p.Src0RC32:$src0, p.Src1RC32:$src1) - >; - + def : SIInstAlias < + alias_asm, + (!cast(NAME#"_e32_vi") p.Src0RC32:$src0, p.Src1RC32:$src1) + >; } // End AssemblerPredicates = [isVI] } @@ -1697,11 +1703,13 @@ dag ins32, string asm32, list pat32, dag out64, dag ins64, string asm64, list pat64, bit HasMods, bit DefExec, string revOp, - VOPProfile p> { - defm _e32 : VOPC_m ; + VOPProfile p, + list sched> { + defm _e32 : VOPC_m ; defm _e64 : VOP3_C_m ; + opName, HasMods, DefExec, revOp, + sched>; } // Special case for class instructions which only have modifiers on @@ -1710,18 +1718,21 @@ dag ins32, string asm32, list pat32, dag out64, dag ins64, string asm64, list pat64, bit HasMods, bit DefExec, string revOp, - VOPProfile p> { - defm _e32 : VOPC_m ; + VOPProfile p, + list sched> { + defm _e32 : VOPC_m ; defm _e64 : VOP3_C_m , + opName, HasMods, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst : VOPC_Helper < + bit DefExec = 0, + list sched = [Write32Bit]> : + VOPC_Helper < op, opName, P.Ins32, P.Asm32, [], (outs VOPDstS64:$dst), P.Ins64, P.Asm64, @@ -1732,11 +1743,12 @@ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp, P + P.HasModifiers, DefExec, revOp, P, sched >; multiclass VOPCClassInst : VOPC_Class_Helper < + bit DefExec = 0, + list sched> : VOPC_Class_Helper < op, opName, P.Ins32, P.Asm32, [], (outs VOPDstS64:$dst), P.Ins64, P.Asm64, @@ -1744,7 +1756,7 @@ [(set i1:$dst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName, P + P.HasModifiers, DefExec, opName, P, sched >; @@ -1752,31 +1764,32 @@ VOPCInst ; multiclass VOPC_F64 : - VOPCInst ; + VOPCInst ; multiclass VOPC_I32 : VOPCInst ; multiclass VOPC_I64 : - VOPCInst ; + VOPCInst ; multiclass VOPCX sched, string revOp = ""> - : VOPCInst ; + : VOPCInst ; multiclass VOPCX_F32 : - VOPCX ; + VOPCX ; multiclass VOPCX_F64 : - VOPCX ; + VOPCX ; multiclass VOPCX_I32 : - VOPCX ; + VOPCX ; multiclass VOPCX_I64 : - VOPCX ; + VOPCX ; multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < @@ -1784,16 +1797,16 @@ >; multiclass VOPC_CLASS_F32 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOPCX_CLASS_F32 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOPC_CLASS_F64 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOPCX_CLASS_F64 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOP3Inst : VOP3_Helper < Index: lib/Target/AMDGPU/SISchedule.td =================================================================== --- lib/Target/AMDGPU/SISchedule.td +++ lib/Target/AMDGPU/SISchedule.td @@ -21,12 +21,23 @@ // Vector ALU instructions def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; +def WriteFullOrQuarterRate32 : SchedWrite; def WriteFloatFMA : SchedWrite; -def WriteDouble : SchedWrite; +// Slow quarter rate f64 instruction. +def WriteDouble : SchedWrite; + +// half rate f64 instruction (same as v_add_f64) def WriteDoubleAdd : SchedWrite; +// Half rate 64-bit instructions. +def Write64Bit : SchedWrite; + +// FIXME: Should there be a class for instructions which are VALU +// instructions and have VALU rates, but write to the SALU (i.e. VOPC +// instructions) + def SIFullSpeedModel : SchedMachineModel; def SIQuarterSpeedModel : SchedMachineModel; @@ -53,7 +64,7 @@ // The latency numbers are taken from AMD Accelerated Parallel Processing -// guide. They may not be acurate. +// guide. They may not be accurate. // The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { @@ -66,6 +77,7 @@ def : HWWriteRes; // 300 - 600 def : HWVALUWriteRes; + def : HWVALUWriteRes; def : HWVALUWriteRes; } Index: test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll @@ -271,7 +271,8 @@ ; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { @@ -285,7 +286,8 @@ ; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -128,18 +128,18 @@ ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] ; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] -; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] -; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] +; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] +; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]] ; SI: s_cbranch_execz BB3_5 ; SI: BB#4: ; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 vcc -; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] +; SI: v_cmp_ge_i64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]] +; SI: s_or_b64 [[COND_STATE]], [[CMP]], [[COND_STATE]] ; SI: BB3_5: -; SI: s_or_b64 exec, exec, [[ORNEG1]] -; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] +; SI: s_or_b64 exec, exec, [[ORNEG2]] +; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[COND_STATE]] ; SI: s_andn2_b64 exec, exec, [[COND_STATE]] ; SI: s_cbranch_execnz BB3_3