Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h @@ -349,6 +349,14 @@ return get(Opcode).TSFlags & SIInstrFlags::DPP; } + bool isVGPRCopy(const MachineInstr &MI) const { + assert(MI.isCopy()); + unsigned Dest = MI.getOperand(0).getReg(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return !RI.isSGPRReg(MRI, Dest); + } + bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; Index: llvm/trunk/lib/Target/AMDGPU/SISchedule.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SISchedule.td +++ llvm/trunk/lib/Target/AMDGPU/SISchedule.td @@ -11,6 +11,12 @@ // //===----------------------------------------------------------------------===// +def : PredicateProlog<[{ + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(SchedModel->getInstrInfo()); + (void)TII; +}]>; + def WriteBranch : SchedWrite; def WriteExport : SchedWrite; def WriteLDS : SchedWrite; @@ -96,6 +102,12 @@ def : HWVALUWriteRes; } +def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; +def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>; +def WriteCopy : SchedWriteVariant<[ + SchedVar<PredIsVGPR32Copy, [Write32Bit]>, + SchedVar<PredIsVGPR64Copy, [Write64Bit]>, + SchedVar<NoSchedPred, [WriteSALU]>]>; let SchedModel = SIFullSpeedModel in { @@ -105,6 +117,8 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; +def : InstRW<[WriteCopy], (instrs COPY)>; + } // End SchedModel = SIFullSpeedModel let SchedModel = SIQuarterSpeedModel in { @@ -115,4 +129,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; +def : InstRW<[WriteCopy], (instrs COPY)>; + } // End SchedModel = SIQuarterSpeedModel Index: 
llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll @@ -136,7 +136,8 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i64: -; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} +; SI-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] ; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] @@ -145,7 +146,6 @@ ; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]] ; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]] ; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc -; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() Index: llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -116,7 +116,7 @@ } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64: -; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] ; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] Index: llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll @@ -145,7 +145,7 @@ ; FUNC-LABEL: 
{{^}}s_ctpop_i128: ; GCN: s_bcnt1_i32_b64 [[SRESULT0:s[0-9]+]], ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]], -; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT0]], [[SRESULT1]] +; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]] ; GCN: s_endpgm define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone Index: llvm/trunk/test/CodeGen/AMDGPU/ftrunc.f64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -25,8 +25,8 @@ ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01 -; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]] +; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]] ; SI-DAG: s_not_b64 ; SI-DAG: s_and_b64 ; SI-DAG: cmp_gt_i32 Index: llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll @@ -56,9 +56,9 @@ } ; FUNC-LABEL: {{^}}local_load_v16i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { entry: Index: llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll 
=================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -32,8 +32,7 @@ ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16 -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}} +; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]*}}, {{v[0-9]+}} offset0:4 ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] Index: llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -5,13 +5,13 @@ ; FIXME: Due to changes in the load clustering heuristics. We no longer ; cluster all argument loads together on SI. ; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 Index: llvm/trunk/test/CodeGen/AMDGPU/shl_add_constant.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/shl_add_constant.ll +++ 
llvm/trunk/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -57,8 +57,8 @@ ; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc ; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], [[Y]] +; SI: s_addk_i32 [[RESULT]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; SI: buffer_store_dword [[VRESULT]] define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { @@ -74,8 +74,8 @@ ; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc ; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 ; SI: s_add_i32 [[TMP:s[0-9]+]], [[Y]], [[SHL3]] -; SI: s_addk_i32 [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]] +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; SI: buffer_store_dword [[VRESULT]] define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {