Index: llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -598,6 +598,11 @@
 
     MachineBasicBlock *PostDomBound =
         PDT->findNearestCommonDominator(DomBlocks);
+
+    // FIXME: This fails to find unstructured loops. If we have a def (other
+    // than a constant) in a pair of blocks that end up looping back to each
+    // other, it will be mishandled. Due to structurization this shouldn't
+    // occur in practice.
     unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
 
     SSAUpdater.Initialize(DstReg);
@@ -732,6 +737,9 @@
   const MachineInstr *MI;
   for (;;) {
     MI = MRI->getUniqueVRegDef(Reg);
+    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
+      return true;
+
     if (MI->getOpcode() != AMDGPU::COPY)
       break;
 
@@ -808,9 +816,9 @@
                                           MachineBasicBlock::iterator I,
                                           const DebugLoc &DL, unsigned DstReg,
                                           unsigned PrevReg, unsigned CurReg) {
-  bool PrevVal;
+  bool PrevVal = false;
   bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
-  bool CurVal;
+  bool CurVal = false;
   bool CurConstant = isConstantLaneMask(CurReg, CurVal);
 
   if (PrevConstant && CurConstant) {
Index: llvm/test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -115,27 +115,25 @@
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; GCN-NEXT:    ; implicit-def: $sgpr4
+; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:  BB1_1: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
-; GCN-NEXT:    s_and_b64 s[8:9], s[0:1], exec
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GCN-NEXT:    s_cmp_gt_i32 s4, -1
+; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cmp_gt_i32 s6, -1
 ; GCN-NEXT:    s_cbranch_scc1 BB1_3
 ; GCN-NEXT:  ; %bb.2: ; %bb4
 ; GCN-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
-; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT:  BB1_3: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GCN-NEXT:    s_add_i32 s4, s4, 1
-; GCN-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
+; GCN-NEXT:    s_add_i32 s6, s6, 1
+; GCN-NEXT:    s_and_b64 s[8:9], exec, s[4:5]
 ; GCN-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GCN-NEXT:    s_cbranch_execnz BB1_1
Index: llvm/test/CodeGen/AMDGPU/lower-i1-copies-implicit-def-unstructured-loop.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/lower-i1-copies-implicit-def-unstructured-loop.mir
@@ -0,0 +1,171 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=si-i1-copies -o - %s | FileCheck %s
+
+# %bb.1 and %bb.3 loop back to each other, and thus neither dominates
+# the other.
+# When the phi in %bb.3 was handled, the pass attempted to insert
+# instructions in %bb.1 to handle this def, but ended up inserting mask
+# management instructions before the def of %34.
+# This is avoided by treating IMPLICIT_DEF specially, like constants.
+
+---
+name: recursive_vreg_1_phi
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; CHECK-LABEL: name: recursive_vreg_1_phi
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
+  ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16
+  ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 20
+  ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; CHECK: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 10
+  ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; CHECK: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 killed [[DEF3]], killed [[DEF1]], implicit $exec
+  ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK: [[V_ASHRREV_I32_e32_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e32 31, [[COPY2]], implicit $exec
+  ; CHECK: [[DEF5:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
+  ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_ASHRREV_I32_e32_]], %subreg.sub1
+  ; CHECK: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY killed [[S_MOV_B32_2]]
+  ; CHECK: [[V_LSHL_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHL_B64_e64 killed [[REG_SEQUENCE]], [[COPY3]], implicit $exec
+  ; CHECK: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD killed [[V_LSHL_B64_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
+  ; CHECK: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 68
+  ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_4]]
+  ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+  ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; CHECK: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 432
+  ; CHECK: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_64 = V_MAD_I64_I32_e64 killed [[FLAT_LOAD_DWORD]], killed [[S_MOV_B32_5]], [[REG_SEQUENCE1]], 0, implicit $exec
+  ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK: bb.1:
+  ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF6]], %bb.0, %31, %bb.3
+  ; CHECK: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %54, %bb.3
+  ; CHECK: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_3]], %bb.0, %29, %bb.3
+  ; CHECK: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI]], $exec, implicit-def $scc
+  ; CHECK: [[COPY6:%[0-9]+]]:sreg_64 = COPY [[S_ANDN2_B64_]]
+  ; CHECK: S_CMP_EQ_U32 [[PHI2]], killed [[S_MOV_B32_6]], implicit-def $scc
+  ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; CHECK: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK: S_CBRANCH_SCC1 %bb.3, implicit $scc
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.3(0x80000000)
+  ; CHECK: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[V_MAD_I64_I32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
+  ; CHECK: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 6
+  ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_7]]
+  ; CHECK: [[V_LSHR_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHR_B32_e32 killed [[FLAT_LOAD_DWORD1]], killed [[COPY7]], implicit $exec
+  ; CHECK: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[V_LSHR_B32_e32_]], implicit $exec
+  ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
+  ; CHECK: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[PHI1]]
+  ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+  ; CHECK: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[PHI1]]
+  ; CHECK: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_AND_B64_]], [[COPY9]], implicit-def dead $scc
+  ; CHECK: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK: [[S_ANDN2_B64_1:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[COPY6]], $exec, implicit-def $scc
+  ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_OR_B64_]], $exec, implicit-def $scc
+  ; CHECK: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_1]], [[S_AND_B64_1]], implicit-def $scc
+  ; CHECK: bb.3:
+  ; CHECK: successors: %bb.4(0x00000000), %bb.1(0x80000000)
+  ; CHECK: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[COPY6]], %bb.1, [[S_OR_B64_1]], %bb.2
+  ; CHECK: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[PHI1]], %bb.1, [[DEF9]], %bb.2
+  ; CHECK: [[PHI5:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_1]], %bb.1, [[S_MOV_B64_2]], %bb.2
+  ; CHECK: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI5]], implicit $exec
+  ; CHECK: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; CHECK: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK: V_CMP_NE_U32_e32 killed [[S_MOV_B32_9]], [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
+  ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; CHECK: [[S_ANDN2_B64_2:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], $exec, implicit-def $scc
+  ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[PHI3]], $exec, implicit-def $scc
+  ; CHECK: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_2]], [[S_AND_B64_2]], implicit-def $scc
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.4
+  ; CHECK: bb.4:
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16
+
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 20
+    %2:vgpr_32 = COPY %1
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:sreg_32 = S_MOV_B32 10
+    %5:vgpr_32 = COPY %4
+    %6:vgpr_32 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %8:vgpr_32 = V_OR_B32_e32 killed %7, killed %3, implicit $exec
+    %9:vgpr_32 = COPY $vgpr0
+    %10:sreg_32 = IMPLICIT_DEF
+    %11:vgpr_32 = V_ASHRREV_I32_e32 31, %9, implicit $exec
+    %12:sreg_32_xm0 = IMPLICIT_DEF
+    %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
+    %14:sreg_32 = S_MOV_B32 2
+    %15:sgpr_32 = COPY killed %14
+    %16:vreg_64 = V_LSHL_B64_e64 killed %13, %15, implicit $exec
+    %17:vgpr_32 = FLAT_LOAD_DWORD killed %16, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
+    %18:sreg_32 = S_MOV_B32 0
+    %19:sreg_32 = S_MOV_B32 68
+    %20:vgpr_32 = COPY killed %19
+    %21:vgpr_32 = COPY %18
+    %22:vreg_64 = REG_SEQUENCE killed %20, %subreg.sub0, %21, %subreg.sub1
+    %23:sreg_32 = S_MOV_B32 432
+    %24:vreg_64, %25:sreg_64 = V_MAD_I64_I32_e64 killed %17, killed %23, %22, 0, implicit $exec
+    %26:sreg_64 = S_MOV_B64 0
+    %27:vreg_1 = COPY %26, implicit $exec
+
+  bb.1:
+    successors: %bb.2, %bb.3
+
+    %28:sreg_32 = PHI %18, %bb.0, %29, %bb.3
+    %30:vreg_1 = PHI %27, %bb.0, %31, %bb.3
+    %32:sreg_32 = S_MOV_B32 0
+    S_CMP_EQ_U32 %28, killed %32, implicit-def $scc
+    %33:sreg_64 = S_MOV_B64 -1
+    %34:sreg_64 = IMPLICIT_DEF
+    %35:vreg_1 = COPY %34
+    S_CBRANCH_SCC1 %bb.3, implicit $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    %36:vgpr_32 = FLAT_LOAD_DWORD %24, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
+    %37:sreg_32 = S_MOV_B32 6
+    %38:vgpr_32 = COPY %37
+    %39:vgpr_32 = V_LSHR_B32_e32 killed %36, killed %38, implicit $exec
+    %40:sreg_32 = IMPLICIT_DEF
+    %41:vgpr_32 = V_AND_B32_e64 1, %39, implicit $exec
+    %42:sreg_64 = V_CMP_EQ_U32_e64 killed %41, 1, implicit $exec
+    %43:sreg_64 = COPY %30
+    %44:sreg_64 = S_AND_B64 %43, killed %42, implicit-def dead $scc
+    %45:sreg_64 = COPY %30
+    %46:sreg_64 = S_OR_B64 killed %44, %45, implicit-def dead $scc
+    %47:sreg_64 = S_MOV_B64 0
+    %48:vreg_1 = COPY %46
+
+  bb.3:
+    successors: %bb.4(0x00000000), %bb.1(0x80000000)
+
+    %31:vreg_1 = PHI %35, %bb.1, %48, %bb.2
+    %49:sreg_64_xexec = PHI %33, %bb.1, %47, %bb.2
+    %29:sreg_32 = S_MOV_B32 -1
+    %50:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %49, implicit $exec
+    %51:sreg_32 = S_MOV_B32 1
+    %52:sreg_32 = IMPLICIT_DEF
+    V_CMP_NE_U32_e32 killed %51, %50, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+
+...
Index: llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -168,11 +168,9 @@
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_b64 s[8:9], vcc, exec
-; SI-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:  BB3_2: ; %Flow
 ; SI-NEXT:    s_or_saveexec_b64 s[2:3], s[2:3]