diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -861,12 +861,16 @@ MachineInstr *VcmpMI; const MachineOperand &Op0 = MI.getOperand(0); const MachineOperand &Op1 = MI.getOperand(1); + + // VCC represents lanes killed. + Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + if (TRI->isVGPR(*MRI, Op0.getReg())) { Opcode = AMDGPU::getVOPe32(Opcode); VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); } else { VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .addReg(AMDGPU::VCC, RegState::Define) + .addReg(VCC, RegState::Define) .addImm(0) // src0 modifiers .add(Op1) .addImm(0) // src1 modifiers @@ -874,9 +878,6 @@ .addImm(0); // omod } - // VCC represents lanes killed. - Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; - MachineInstr *MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3330,6 +3330,50 @@ ret float %out } +; Check if the correct VCC register is selected. WQM pass incorrectly uses VCC for +; vector comparisons in Wave32 mode. +define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) { +; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b32 s3, 0x31016fac +; GFX9-W64-NEXT: s_mov_b32 s2, 32 +; GFX9-W64-NEXT: s_mov_b32 s1, 0x8000 +; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0 +; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0 +; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB50_1 +; GFX9-W64-NEXT: s_endpgm +; GFX9-W64-NEXT: .LBB50_1: +; GFX9-W64-NEXT: s_mov_b64 exec, 0 +; GFX9-W64-NEXT: exp null off, off, off, off done vm +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s3, 0x31016fac +; GFX10-W32-NEXT: s_mov_b32 s2, 32 +; GFX10-W32-NEXT: s_mov_b32 s1, 0x8000 +; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0 +; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0 +; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB50_1 +; GFX10-W32-NEXT: s_endpgm +; GFX10-W32-NEXT: .LBB50_1: +; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-W32-NEXT: exp null off, off, off, off done vm +; GFX10-W32-NEXT: s_endpgm +main_body: + %1 = ptrtoint float addrspace(6)* %0 to i32 + %2 = insertelement <4 x i32> , i32 %1, i32 0 + %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3 + %4 = fcmp nsz arcp ugt float %3, 0.000000e+00 + call void @llvm.amdgcn.kill(i1 %4) #1 + ret void +} + + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1 @@ -3361,6 +3405,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7 attributes #1 = { nounwind } attributes #2 = { nounwind readonly } @@ -3368,3 +3413,4 @@ attributes #4 = { nounwind readnone convergent } attributes #5 = { "amdgpu-ps-wqm-outputs" } attributes #6 = { nounwind "InitialPSInputAddr"="2" } +attributes #7 = { nounwind readnone willreturn }