Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3717,15 +3717,6 @@
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_CBRANCH_SCC0:
-    case AMDGPU::S_CBRANCH_SCC1:
-      // Clear unused bits of vcc
-      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
-              AMDGPU::VCC)
-          .addReg(AMDGPU::EXEC)
-          .addReg(AMDGPU::VCC);
-      break;
-
     case AMDGPU::S_BFE_U64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -46,6 +46,9 @@
 /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block
 /// label1:
 /// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits
+///
+/// This pass also spots an S_CBRANCH_VCCZ/NZ arising from uniform (scalar)
+/// control flow and ensures that the vcc it uses has no bits set for inactive lanes.
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -95,6 +98,8 @@
 
   void combineMasks(MachineInstr &MI);
 
+  void clearVccInactiveLanes(MachineInstr &MI);
+
 public:
   static char ID;
 
@@ -446,6 +451,106 @@
   MRI->getUniqueVRegDef(Reg)->eraseFromParent();
 }
 
+// MI is an S_CBRANCH_VCCZ/NZ arising from uniform (scalar) control flow. Ensure
+// that the vcc it uses does not have bits set in inactive lanes.
+void SILowerControlFlow::clearVccInactiveLanes(MachineInstr &MI) {
+  // Use a worklist to scan a tree of v_cmp, s_and, s_or and copy. Anything
+  // else we find means that we cannot guarantee that vcc has all bits clear
+  // for inactive lanes, so we need to insert an S_AND_B64 to ensure that.
+  MachineBasicBlock &MBB = *MI.getParent();
+  std::vector<MachineOperand *> WorkList;
+  WorkList.push_back(&MI.getOperand(1));
+  assert(WorkList.back()->isReg()
+         && WorkList.back()->getReg() == AMDGPU::VCC);
+  bool InactiveBitsClear = true;
+  while (!WorkList.empty() && InactiveBitsClear) {
+    MachineOperand &WorkOpnd = *WorkList.back();
+    WorkList.pop_back();
+    if (!WorkOpnd.isReg()) {
+      InactiveBitsClear = false;
+      break;
+    }
+    MachineInstr *DefMI = nullptr;
+    MachineOperand *DefOpnd = nullptr;
+    if (WorkOpnd.getReg() == AMDGPU::VCC) {
+      // The operand is vcc. Scan back a little way to find the
+      // instruction that sets it.
+      MachineInstr *ScanMI = WorkOpnd.getParent();
+      for (unsigned ScanLimit = 3;
+           !DefOpnd && ScanMI != &MBB.front() && ScanLimit; --ScanLimit) {
+        ScanMI = ScanMI->getPrevNode();
+        for (unsigned OpndIdx = 0, NumOpnds = ScanMI->getNumOperands();
+             OpndIdx != NumOpnds; ++OpndIdx) {
+          auto &Opnd = ScanMI->getOperand(OpndIdx);
+          if (Opnd.isReg()
+              && Opnd.isDef() && Opnd.getReg() == AMDGPU::VCC) {
+            // We have found the def of vcc.
+            DefMI = ScanMI;
+            DefOpnd = &Opnd;
+            break;
+          }
+        }
+      }
+    } else {
+      // Otherwise, find the single def of this operand.
+      auto WorkReg = WorkOpnd.getReg();
+      if (MRI->hasOneDef(WorkReg)) {
+        DefMI = &*MRI->def_instr_begin(WorkReg);
+        for (unsigned OpndIdx = 0; ; ++OpndIdx) {
+          assert(OpndIdx < DefMI->getNumOperands());
+          auto &Opnd = DefMI->getOperand(OpndIdx);
+          if (Opnd.isReg() && Opnd.isDef() && Opnd.getReg() == WorkReg) {
+            DefOpnd = &Opnd;
+            break;
+          }
+        }
+      }
+    }
+    if (!DefOpnd) {
+      // We did not find the def.
+      InactiveBitsClear = false;
+      break;
+    }
+    // Inspect the def.
+    if (DefMI->isCompare()) {
+      // A compare instruction is known to leave bits for inactive lanes
+      // clear.
+      continue;
+    }
+    if (DefMI->isCopy()) {
+      // Copy instruction: add its input to the worklist.
+      WorkList.push_back(&DefMI->getOperand(1));
+      continue;
+    }
+    switch (DefMI->getOpcode()) {
+    case AMDGPU::V_CNDMASK_B32_e32:
+    case AMDGPU::V_CNDMASK_B32_e64:
+      // A later pass inserts a v_cmp after this, so we can treat it as
+      // a v_cmp.
+      continue;
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64:
+      // And/or: add its two inputs to the worklist.
+      WorkList.push_back(&DefMI->getOperand(1));
+      WorkList.push_back(&DefMI->getOperand(2));
+      break;
+    default:
+      // Any other op means that bits for inactive lanes might be set.
+      InactiveBitsClear = false;
+      break;
+    }
+  }
+  if (!InactiveBitsClear) {
+    // Add an S_AND_B64 to clear bits for inactive lanes.
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_AND_B64),
+            AMDGPU::VCC)
+        .addReg(AMDGPU::EXEC)
+        .addReg(AMDGPU::VCC);
+  }
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
@@ -503,6 +608,15 @@
       Last = I;
       continue;
 
+    case AMDGPU::S_CBRANCH_VCCNZ:
+    case AMDGPU::S_CBRANCH_VCCZ:
+      // This is an S_CBRANCH_VCCZ/NZ from uniform control flow. Ensure that
+      // the vcc it uses does not have bits set in inactive lanes.
+      clearVccInactiveLanes(MI);
+      // Do not replay newly inserted code.
+      Last = I;
+      continue;
+
     default:
       Last = I;
       continue;
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -662,14 +662,14 @@
 ; GCN-LABEL: {{^}}broken_phi_bb:
 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
-; GCN: s_branch [[BB2:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccz [[BB2:BB[0-9]+_[0-9]+]]
 
 ; GCN: {{^BB[0-9]+_[0-9]+}}:
 ; GCN: s_mov_b64 exec,
 
 ; IDXMODE: s_set_gpr_idx_off
 
-; GCN: [[BB2]]:
 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
+; GCN: [[BB2]]:
 ; GCN: buffer_load_dword
 
 ; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -26,10 +26,9 @@
 ; GCN: s_mov_b64 [[INITMASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
 ; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
+; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; %bb.2: ; %bb4
 ; GCN: buffer_load_dword
Index: test/CodeGen/AMDGPU/nested-loop-conditions.ll
===================================================================
--- test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -64,6 +64,7 @@
 
 ; FIXME: Should fold to unconditional branch?
 ; GCN: s_mov_b64 vcc, -1
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: ; implicit-def
 ; GCN: s_cbranch_vccz
 
@@ -178,24 +179,20 @@
 
 ; GCN-LABEL: {{^}}nested_loop_conditions:
 ; GCN: v_cmp_lt_i32_e32 vcc, 8, v
-; GCN: s_and_b64 vcc, exec, vcc
 ; GCN: s_cbranch_vccnz [[BB31:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[BB14:BB[0-9]+_[0-9]+]]: ; %bb14
 ; GCN: v_cmp_ne_u32_e32 vcc, 1, v
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccnz [[BB31]]
 
 ; GCN: [[BB18:BB[0-9]+_[0-9]+]]: ; %bb18
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_lt_i32_e32 vcc, 8, v
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccnz [[BB18]]
 
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_gt_i32_e32 vcc, 9
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccnz [[BB14]]
 
 ; GCN: [[BB31]]:
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -434,7 +434,6 @@
 ; {{^}}sopc_vopc_legalize_bug:
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
-; GCN: s_and_b64 vcc, exec, vcc
 ; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN-NOHSA: buffer_store_dword [[ONE]]
Index: test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx800 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; This checks for a bug where uniform control flow can result in multiple
+; v_cmp results being combined together with s_and_b64, s_or_b64 and s_xor_b64,
+; using the resulting mask in s_cbranch_vccnz, without ensuring that the
+; resulting mask has bits clear for inactive lanes.
+; The problematic case is s_xor_b64, as, unlike the other ops, it can actually
+; set bits for inactive lanes.
+;
+; The check for an s_xor_b64 is just to check that this test tests what it is
+; supposed to test. If the s_xor_b64 disappears due to some other change, it
+; does not necessarily mean that the bug has reappeared.
+;
+; The check for "s_and_b64 vcc, exec, vcc" checks that the bug is fixed.
+
+; CHECK: {{^}}main:
+; CHECK: s_xor_b64
+; CHECK: s_and_b64 vcc, exec, vcc
+
+define amdgpu_cs void @main(i32 inreg %arg) {
+.entry:
+  %tmp44 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tmp16 = load volatile float, float addrspace(1)* undef
+  %tmp22 = load volatile float, float addrspace(1)* undef
+  %tmp25 = load volatile float, float addrspace(1)* undef
+  %tmp31 = fcmp olt float %tmp16, 0x3FA99999A0000000
+  br i1 %tmp31, label %bb, label %.exit.thread
+
+bb:                                               ; preds = %.entry
+  %tmp42 = fcmp olt float %tmp25, 0x3FA99999A0000000
+  br i1 %tmp42, label %bb43, label %.exit.thread
+
+bb43:
+  %tmp46 = fcmp olt <2 x float> %tmp44, <float 0x3FA99999A0000000, float 0x3FA99999A0000000>
+  %tmp47 = extractelement <2 x i1> %tmp46, i32 0
+  %tmp48 = extractelement <2 x i1> %tmp46, i32 1
+  %tmp49 = and i1 %tmp47, %tmp48
+  br i1 %tmp49, label %bb50, label %.exit.thread
+
+bb50:
+  %tmp53 = fcmp olt float %tmp22, 0x3FA99999A0000000
+  br i1 %tmp53, label %.exit3.i, label %.exit.thread
+
+.exit3.i:
+  store volatile i32 0, i32 addrspace(1)* undef
+  br label %.exit.thread
+
+.exit.thread:
+  ret void
+}
+
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- test/CodeGen/AMDGPU/skip-if-dead.ll
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -218,7 +218,6 @@
 ; CHECK-NEXT: ; %bb.3:
 ; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
 ; CHECK: v_cmp_eq_u32_e32 vcc, 0, [[LOAD]]
-; CHECK-NEXT: s_and_b64 vcc, exec, vcc
 ; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]]
 
 ; CHECK-NEXT: {{^}}[[EXIT]]:
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -118,7 +118,6 @@
 ; Using a floating-point value in an integer compare will cause the compare to
 ; be selected for the SALU and then later moved to the VALU.
 ; GCN: v_cmp_ne_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
-; GCN: s_and_b64 vcc, exec, [[COND]]
 ; GCN: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
@@ -143,7 +142,6 @@
 ; Using a floating-point value in an integer compare will cause the compare to
 ; be selected for the SALU and then later moved to the VALU.
 ; GCN: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
-; GCN: s_and_b64 vcc, exec, [[COND]]
 ; GCN: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
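For reference, a minimal standalone C++ sketch of the lane-mask reasoning behind the fix (not part of the patch; the exec and compare mask values are made up for illustration): a v_cmp result has its bits clear for inactive lanes, s_and_b64/s_or_b64 of two such masks preserve that property, but s_xor_b64 with an all-ones operand (a negated uniform condition) sets the inactive-lane bits, which is why clearVccInactiveLanes falls back to inserting "s_and_b64 vcc, exec, vcc" before the branch.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Exec = 0x00000000FFFFFFFFull; // only the low 32 lanes are active

  // A v_cmp result never has bits set for lanes that are inactive in exec.
  const uint64_t CmpA = 0x0000000012345678ull & Exec;
  const uint64_t CmpB = 0x00000000F0F0F0F0ull & Exec;

  // s_and_b64 / s_or_b64 of two such masks keep inactive-lane bits clear.
  assert(((CmpA & CmpB) & ~Exec) == 0);
  assert(((CmpA | CmpB) & ~Exec) == 0);

  // s_xor_b64 with -1 (a negated uniform condition) sets every inactive-lane
  // bit, so a bare s_cbranch_vccnz could be taken based on inactive lanes alone.
  uint64_t Vcc = CmpA ^ ~0ull;
  assert((Vcc & ~Exec) != 0);

  // The fallback the pass inserts: "s_and_b64 vcc, exec, vcc".
  Vcc &= Exec;
  assert((Vcc & ~Exec) == 0);
  return 0;
}

This is also why the xor case is deliberately absent from the switch in clearVccInactiveLanes: anything other than compares, copies, v_cndmask and s_and/s_or falls through to the default case and forces the s_and_b64 with exec to be inserted.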