diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1237,6 +1237,11 @@
         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
       } else {
+        // Skip updating the literal use if it is in the same REG_SEQUENCE as
+        // the recorded one; if that literal could be inlined, it is still just
+        // a single use.
+        if (NonInlineUse && NonInlineUse->getParent() == UseMI &&
+            UseMI->isRegSequence())
+          continue;
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
           NonInlineUseOpNo = OpNo;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -102,6 +102,10 @@
   cl::desc("Use indirect register addressing for divergent indexes"),
   cl::init(false));
 
+static cl::opt<bool> EnableLowerSGPRToVGPRCopy(
+    "lower-sgpr-to-vgpr-copy", cl::Hidden,
+    cl::desc("Enable lowering copy from SGPR to VGPR"), cl::init(true));
+
 static bool hasFP32Denormals(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().allFP32Denormals();
@@ -11329,6 +11333,59 @@
   return false;
 }
 
+// Lower a COPY from SGPR to VGPR into a real move instruction, as it is a
+// real data transfer rather than a plain COPY.
+static void lowerSGPRToVGPRCopy(MachineFunction &MF, MachineRegisterInfo &MRI,
+                                const SIRegisterInfo &TRI,
+                                const SIInstrInfo &TII) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (auto BI = MBB.begin(), BE = MBB.end(); BI != BE; /*EMPTY*/) {
+      MachineInstr &MI = *BI++;
+
+      auto isSGPRToVGPRCopy = [&MRI, &TRI](const MachineInstr &MI) {
+        if (!MI.isCopy())
+          return false;
+
+        auto DstReg = MI.getOperand(0).getReg();
+        auto SrcReg = MI.getOperand(1).getReg();
+        auto DstRC = DstReg.isVirtual() ? MRI.getRegClass(DstReg)
+                                        : TRI.getPhysRegClass(DstReg);
+        auto SrcRC = SrcReg.isVirtual() ? MRI.getRegClass(SrcReg)
+                                        : TRI.getPhysRegClass(SrcReg);
+        return (DstRC == &AMDGPU::VGPR_32RegClass ||
+                DstRC == &AMDGPU::VReg_64RegClass) &&
+               (SrcRC == &AMDGPU::SGPR_32RegClass ||
+                SrcRC == &AMDGPU::SGPR_64RegClass);
+      };
+
+      // Skip if it's not a copy from SGPR to VGPR.
+      if (!isSGPRToVGPRCopy(MI))
+        continue;
+
+      const MachineOperand &Src = MI.getOperand(1);
+      // FIXME: Need subreg support.
+      if (Src.getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      // FIXME: Need undef support.
+      if (Src.getReg().isVirtual()) {
+        auto DefMI = MRI.getVRegDef(Src.getReg());
+        if (!DefMI || DefMI->isImplicitDef())
+          continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Lower COPY: " << MI);
+      unsigned Opcode = (TRI.getRegSizeInBits(Src.getReg(), MRI) == 64)
+                            ? AMDGPU::V_MOV_B64_PSEUDO
+                            : AMDGPU::V_MOV_B32_e32;
+      auto DstReg = MI.getOperand(0).getReg();
+      auto MIB = BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
+                     .add(MI.getOperand(1));
+      LLVM_DEBUG(dbgs() << " to: " << *MIB.getInstr());
+      MI.eraseFromParent();
+    }
+  }
+}
+
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -11337,6 +11394,10 @@
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+  if (EnableLowerSGPRToVGPRCopy)
+    lowerSGPRToVGPRCopy(MF, MRI, *TRI, *TII);
 
   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -11,7 +11,7 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -24,7 +24,7 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -36,7 +36,7 @@
 ; FUNC-LABEL: {{^}}s_fabs_f32:
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -34,7 +34,7 @@
 ; R600: |PV.{{[XYZW]}}|
 ; R600: -PV
 
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
 ; VI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
@@ -49,7 +49,7 @@
 ; R600: |PV.{{[XYZW]}}|
 ; R600: -PV
 
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
@@ -59,7 +59,7 @@
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_f32:
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   %fsub = fsub float -0.000000e+00, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK-LABEL: {{^}}t0:
+; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0
+; CHECK-COUNT-1: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
+; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
+define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) {
+entry:
+  %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %i = add i32 %0, %i0
+  %j = add i32 %0, %j0
+  %k = add i32 %0, %k0
+  %pi = getelementptr float, float addrspace(1)* %p, i32 %i
+  %vi = load float, float addrspace(1)* %pi
+  %pj = getelementptr float, float addrspace(1)* %p, i32 %j
+  %vj = load float, float addrspace(1)* %pj
+  %sum = fadd float %vi, %vj
+  %pk = getelementptr float, float addrspace(1)* %p, i32 %k
+  store float %sum, float addrspace(1)* %pk
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -153,7 +153,9 @@
 
 ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 ; GCN: flat_load_dword
-; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
+; GFX8_9: s_waitcnt vmcnt(0){{$}}
+; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -650,12 +650,12 @@
 ; CHECK: image_store
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
-; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
+; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
 
 ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
 
 ; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
-; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
+; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
 ; CHECK: s_cbranch_vccz [[LOOPHDR]]
 ; CHECK: ; %break