Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -174,6 +174,31 @@
   return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
 }
 
+static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
+                                      const SIRegisterInfo *TRI,
+                                      const SIInstrInfo *TII) {
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  auto &Src = MI.getOperand(1);
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = Src.getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+      !TargetRegisterInfo::isVirtualRegister(DstReg))
+    return false;
+
+  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
+    const auto *UseMI = MO.getParent();
+    if (UseMI == &MI)
+      continue;
+    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
+        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
+        !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
+      return false;
+  }
+  // Change VGPR to SGPR destination.
+  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
+  return true;
+}
+
 // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
 //
 // SGPRx = ...
@@ -214,6 +239,9 @@
   if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
     return false;
 
+  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
+    return true;
+
   // TODO: Could have multiple extracts?
   unsigned SubReg = CopyUse.getOperand(1).getSubReg();
   if (SubReg != AMDGPU::NoSubRegister)
@@ -563,6 +591,8 @@
           break;
         }
         TII->moveToVALU(MI);
+      } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+        tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
       }
 
       break;
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
-; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
-; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
Index: test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -0,0 +1,337 @@
+# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+# Check that the constant is kept in SGPR registers.
+
+# GCN-LABEL: {{^}}name: const_to_sgpr{{$}}
+# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0
+# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+
+
+# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}}
+# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0
+# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+
+
+--- |
+  define amdgpu_kernel void @const_to_sgpr(i32 addrspace(1)* nocapture %arg, i64 %id) {
+  bb:
+    %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0, !range !0
+    %tid64 = zext i32 %tid to i64
+    %x = add i64 %tid64, %id
+    %cmp = icmp ugt i64 %x, 1048575
+    %0 = xor i1 %cmp, true
+    %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+    %2 = extractvalue { i1, i64 } %1, 0
+    %3 = extractvalue { i1, i64 } %1, 1
+    br i1 %2, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid64
+    store i32 0, i32 addrspace(1)* %gep, align 4
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    call void @llvm.amdgcn.end.cf(i64 %3)
+    ret void
+  }
+
+  define amdgpu_kernel void @const_to_sgpr_multiple_use(i32 addrspace(1)* nocapture %arg, i64 %id1, i64 %id2) {
+  bb:
+    %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0, !range !0
+    %tid64 = zext i32 %tid to i64
+    %x1 = add i64 %tid64, %id1
+    %x2 = add i64 %tid64, %id2
+    %cmp1 = icmp ugt i64 %x1, 1048575
+    %cmp2 = icmp ugt i64 %x2, 1048575
+    %cond = or i1 %cmp1, %cmp2
+    %0 = xor i1 %cond, true
+    %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+    %2 = extractvalue { i1, i64 } %1, 0
+    %3 = extractvalue { i1, i64 } %1, 1
+    br i1 %2, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid64
+    store i32 0, i32 addrspace(1)* %gep, align 4
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    call void @llvm.amdgcn.end.cf(i64 %3)
+    ret void
+  }
+
+  ; Function Attrs: nounwind readnone speculatable
+  declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.break(i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+  ; Function Attrs: convergent nounwind
+  declare i1 @llvm.amdgcn.loop(i64) #1
+
+  ; Function Attrs: convergent nounwind
+  declare void @llvm.amdgcn.end.cf(i64) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  attributes #0 = { nounwind readnone speculatable }
+  attributes #1 = { convergent nounwind }
+  attributes #2 = { convergent nounwind readnone }
+  attributes #3 = { nounwind }
+
+  !0 = !{i32 0, i32 256}
+
+...
+---
+name: const_to_sgpr
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64, preferred-register: '' }
+  - { id: 1, class: sreg_64, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: sgpr_64, preferred-register: '' }
+  - { id: 4, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 5, class: sgpr_32, preferred-register: '' }
+  - { id: 6, class: sreg_64, preferred-register: '' }
+  - { id: 7, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 8, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 9, class: sreg_32, preferred-register: '' }
+  - { id: 10, class: sreg_64, preferred-register: '' }
+  - { id: 11, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 12, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 13, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 14, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 15, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 16, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 17, class: sreg_64, preferred-register: '' }
+  - { id: 18, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 19, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 20, class: sreg_64, preferred-register: '' }
+  - { id: 21, class: sreg_64, preferred-register: '' }
+  - { id: 22, class: vreg_64, preferred-register: '' }
+  - { id: 23, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 24, class: sreg_64, preferred-register: '' }
+  - { id: 25, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 26, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 27, class: sgpr_64, preferred-register: '' }
+  - { id: 28, class: sgpr_128, preferred-register: '' }
+  - { id: 29, class: vgpr_32, preferred-register: '' }
+  - { id: 30, class: vreg_64, preferred-register: '' }
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%2' }
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 4294967295
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+  bb.0.bb:
+    successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
+    liveins: %vgpr0, %sgpr0_sgpr1
+
+    %3 = COPY %sgpr0_sgpr1
+    %2 = COPY %vgpr0
+    %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %6 = COPY %7
+    %9 = S_MOV_B32 0
+    %10 = REG_SEQUENCE %2, 1, killed %9, 2
+    %0 = COPY %10
+    %11 = COPY %10.sub0
+    %12 = COPY %10.sub1
+    %13 = COPY %8.sub0
+    %14 = COPY %8.sub1
+    %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc
+    %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc
+    %17 = REG_SEQUENCE killed %15, 1, killed %16, 2
+    %18 = S_MOV_B32 0
+    %19 = S_MOV_B32 1048576
+    %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
+    %22 = COPY killed %20
+    %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec
+    %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_BRANCH %bb.1.bb1
+
+  bb.1.bb1:
+    successors: %bb.2.bb2(0x80000000)
+
+    %23 = S_MOV_B32 2
+    %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc
+    %25 = S_MOV_B32 61440
+    %26 = S_MOV_B32 0
+    %27 = REG_SEQUENCE killed %26, 1, killed %25, 2
+    %28 = REG_SEQUENCE %6, 17, killed %27, 18
+    %29 = V_MOV_B32_e32 0, implicit %exec
+    %30 = COPY %24
+    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.gep)
+
+  bb.2.bb2:
+    SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_ENDPGM
+
+...
+---
+name: const_to_sgpr_multiple_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64, preferred-register: '' }
+  - { id: 1, class: sreg_64, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: sgpr_64, preferred-register: '' }
+  - { id: 4, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 5, class: sgpr_32, preferred-register: '' }
+  - { id: 6, class: sreg_64, preferred-register: '' }
+  - { id: 7, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 8, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 9, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 10, class: sreg_32, preferred-register: '' }
+  - { id: 11, class: sreg_64, preferred-register: '' }
+  - { id: 12, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 13, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 14, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 15, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 16, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 17, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 18, class: sreg_64, preferred-register: '' }
+  - { id: 19, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 20, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 21, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 22, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 23, class: sreg_64, preferred-register: '' }
+  - { id: 24, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 25, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 26, class: sreg_64, preferred-register: '' }
+  - { id: 27, class: sreg_64, preferred-register: '' }
+  - { id: 28, class: vreg_64, preferred-register: '' }
+  - { id: 29, class: sreg_64, preferred-register: '' }
+  - { id: 30, class: vreg_64, preferred-register: '' }
+  - { id: 31, class: sreg_64, preferred-register: '' }
+  - { id: 32, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 33, class: sreg_64, preferred-register: '' }
+  - { id: 34, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 35, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 36, class: sgpr_64, preferred-register: '' }
+  - { id: 37, class: sgpr_128, preferred-register: '' }
+  - { id: 38, class: vgpr_32, preferred-register: '' }
+  - { id: 39, class: vreg_64, preferred-register: '' }
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%2' }
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 4294967295
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+  bb.0.bb:
+    successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
+    liveins: %vgpr0, %sgpr0_sgpr1
+
+    %3 = COPY %sgpr0_sgpr1
+    %2 = COPY %vgpr0
+    %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %9 = S_LOAD_DWORDX2_IMM %3, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %6 = COPY %7
+    %10 = S_MOV_B32 0
+    %11 = REG_SEQUENCE %2, 1, killed %10, 2
+    %0 = COPY %11
+    %12 = COPY %11.sub0
+    %13 = COPY %11.sub1
+    %14 = COPY %8.sub0
+    %15 = COPY %8.sub1
+    %16 = S_ADD_U32 %12, killed %14, implicit-def %scc
+    %17 = S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc
+    %18 = REG_SEQUENCE killed %16, 1, killed %17, 2
+    %19 = COPY %9.sub0
+    %20 = COPY %9.sub1
+    %21 = S_ADD_U32 %12, killed %19, implicit-def %scc
+    %22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc
+    %23 = REG_SEQUENCE killed %21, 1, killed %22, 2
+    %24 = S_MOV_B32 0
+    %25 = S_MOV_B32 1048576
+    %26 = REG_SEQUENCE killed %25, 1, killed %24, 2
+    %28 = COPY %26
+    %27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec
+    %29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec
+    %31 = S_AND_B64 killed %27, killed %29, implicit-def dead %scc
+    %1 = SI_IF killed %31, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_BRANCH %bb.1.bb1
+
+  bb.1.bb1:
+    successors: %bb.2.bb2(0x80000000)
+
+    %32 = S_MOV_B32 2
+    %33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc
+    %34 = S_MOV_B32 61440
+    %35 = S_MOV_B32 0
+    %36 = REG_SEQUENCE killed %35, 1, killed %34, 2
+    %37 = REG_SEQUENCE %6, 17, killed %36, 18
+    %38 = V_MOV_B32_e32 0, implicit %exec
+    %39 = COPY %33
+    BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.gep)
+
+  bb.2.bb2:
+    SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_ENDPGM
+
+...
Index: test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -400,9 +400,9 @@
 
 ; Check that "pulling out" SDWA operands works correctly.
 ; GCN-LABEL: {{^}}pulled_out_test:
-; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_and_b32_sdwa
Index: test/CodeGen/AMDGPU/sint_to_fp.i64.ll
===================================================================
--- test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -22,7 +22,7 @@
 ; GCN: v_cndmask
 
 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64
 
 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
 ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
@@ -57,7 +57,7 @@
 ; GCN: v_cndmask
 
 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64
 
 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
 ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
Index: test/CodeGen/AMDGPU/uint_to_fp.i64.ll
===================================================================
--- test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -19,7 +19,7 @@
 ; GCN: v_cndmask
 
 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64
 
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]]
@@ -50,7 +50,7 @@
 ; GCN: v_cndmask
 
 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64
 
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
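
Note for reviewers (illustration only, not part of the patch): a minimal before/after sketch of the rewrite that tryChangeVGPRtoSGPRinCopy performs, using the virtual register numbers from the const_to_sgpr body above. Before the passes run, the 64-bit constant is built in SGPRs and then copied into a VGPR pair just to feed the compare:

  %20 = REG_SEQUENCE killed %19, 1, killed %18, 2          ; sreg_64 constant pair
  %22 = COPY killed %20                                    ; SGPR -> VGPR copy (vreg_64)
  %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec

The function walks every user of the copy's destination in the same block and, since TII->isOperandLegal accepts the SGPR source in that V_CMP_LT_U64_e64 operand, it retypes %22 to the equivalent SGPR class instead of moving the compare to the VALU. The now SGPR-to-SGPR COPY is then folded by si-fold-operands and deleted by dead-mi-elimination, so the compare reads the constant pair directly:

  %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
  %21 = V_CMP_LT_U64_e64 killed %17, %20, implicit %exec   ; %22 and its COPY are gone

(The add chain defining the first operand, %17, has a VGPR input and is legalized separately, which is why the GCN checks above only pin the second operand to %[[SGPR_PAIR]].)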