Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9586,7 +9586,16 @@
 
   // sub x, zext (setcc) => subcarry x, 0, setcc
   // sub x, sext (setcc) => addcarry x, 0, setcc
-  unsigned Opc = RHS.getOpcode();
+
+  bool Commuted = false;
+  unsigned Opc = LHS.getOpcode();
+  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
+      Opc == ISD::ANY_EXTEND) {
+    std::swap(RHS, LHS);
+    Commuted = true;
+  }
+
+  Opc = RHS.getOpcode();
   switch (Opc) {
   default: break;
   case ISD::ZERO_EXTEND:
@@ -9598,8 +9607,22 @@
     if (!isBoolSGPR(Cond))
       break;
     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
-    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
-    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
+    SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+    SDValue Args[3];
+    Args[2] = Cond;
+
+    if (Commuted) {
+      // sub zext (setcc), x => addcarry 0, x, setcc
+      // sub sext (setcc), x => subcarry 0, x, setcc
+      Args[0] = Zero;
+      Args[1] = LHS;
+      Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
+    } else {
+      Args[0] = LHS;
+      Args[1] = Zero;
+      Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
+    }
+
     return DAG.getNode(Opc, SL, VTList, Args);
   }
   }
Index: llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -311,6 +311,48 @@
   ret void
 }
+
+; Check case where sub is commuted with zext
+; GCN-LABEL: {{^}}sub_zext_setcc_commute:
+; GCN: v_cmp_gt_u32_e32 vcc, v
+; GCN-NOT: vcc
+; GCN: v_addc_u32_e32 [[ADDC:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, s{{[0-9]+}}, [[ADDC]]
+define amdgpu_kernel void @sub_zext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = zext i1 %cmp to i32
+  %adde = sub i32 %v, %ext
+  %sub = sub i32 %a, %adde
+  %sub2 = sub i32 %sub, %b
+  store i32 %sub2, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_sext_setcc_commute:
+; GCN: v_cmp_gt_u32_e32 vcc, v
+; GCN-NOT: vcc
+; GCN: v_subb_u32_e32 [[SUBB:v[0-9]+]], vcc, 0, v{{[0-9]+}}, vcc
+; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[SUBB]]
+; GCN: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, s{{[0-9]+}}, [[ADD]]
+define amdgpu_kernel void @sub_sext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %adde = sub i32 %v, %ext
+  %sub = sub i32 %a, %adde
+  %sub2 = sub i32 %sub, %b
+  store i32 %sub2, i32 addrspace(1)* %gep, align 4
+  ret void
+}
 
 declare i1 @llvm.amdgcn.class.f32(float, i32) #0
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0