diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4118,6 +4118,7 @@ unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) @@ -4138,7 +4139,10 @@ } const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); - if (TRI->getRegSizeInBits(*Src2RC) == 64) { + unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC); + assert(WaveSize == 64 || WaveSize == 32); + + if (WaveSize == 64) { if (ST.hasScalarCompareEq64()) { BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) .addReg(Src2.getReg()) @@ -4168,8 +4172,13 @@ BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); - BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg()) - .addReg(AMDGPU::SCC); + unsigned SelOpc = + (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + + BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg()) + .addImm(-1) + .addImm(0); + MI.eraseFromParent(); return BB; } diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,7 +14,7 @@ ; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s4, s6, 0 -; GFX7-NEXT: s_cselect_b64 vcc, 1, 0 +; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_cmp_gt_u32 s6, 31 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -31,7 +31,7 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], s6, s6 ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s4, s6, 0 -; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_cmp_gt_u32 s6, 31 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -49,7 +49,7 @@ ; GFX10-NEXT: v_add_co_u32 v0, s5, s4, s4 ; GFX10-NEXT: s_cmpk_lg_u32 s5, 0x0 ; GFX10-NEXT: s_addc_u32 s5, s4, 0 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_cselect_b32 s6, -1, 0 ; GFX10-NEXT: s_cmp_gt_u32 s4, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s5, s6 ; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0