Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -271,7 +271,11 @@ // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. case ISD::ADD: - case ISD::SUB: { + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUB: + case ISD::SUBC: + case ISD::SUBE: { if (N->getValueType(0) != MVT::i64 || Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; @@ -576,7 +580,12 @@ SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - bool IsAdd = (N->getOpcode() == ISD::ADD); + unsigned Opcode = N->getOpcode(); + bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE); + bool ProduceCarry = + ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC; + bool IsAdd = + (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE); SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); @@ -592,25 +601,43 @@ DL, MVT::i32, RHS, Sub1); SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); - SDValue Carry(AddLo, 1); - SDNode *AddHi - = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, - SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + SDNode *AddLo; + if (!ConsumeCarry) { + SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args); + } else { + SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) }; + AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args); + } + SDValue AddHiArgs[] = { + SDValue(Hi0, 0), + SDValue(Hi1, 0), + SDValue(AddLo, 1) + }; + SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs); - SDValue Args[5] = { + SDValue RegSequenceArgs[] = { CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), SDValue(AddLo,0), Sub0, SDValue(AddHi,0), Sub1, }; - CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); + SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs); + + if (ProduceCarry) { + // Replace the carry-use + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1)); + } + + // Replace the remaining uses. + CurDAG->ReplaceAllUsesWith(N, RegSequence); + CurDAG->RemoveDeadNode(N); } // We need to handle this here because tablegen doesn't support matching Index: test/CodeGen/AMDGPU/add_i128.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/add_i128.ll @@ -0,0 +1,56 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_i128_vreg: +; GCN: v_add_i32_e32 v[[LO:[0-9]+]], vcc, +; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]], +define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone + %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid + %a = load i128, i128 addrspace(1)* %a_ptr + %b = load i128, i128 addrspace(1)* %b_ptr + %result = add i128 %a, %b + store i128 %result, i128 addrspace(1)* %out + ret void +} + +; Check that the SGPR add operand is correctly moved to a VGPR. +; GCN-LABEL: {{^}}sgpr_operand: +; GCN: v_add_i32 +; GCN: v_addc_u32 +; GCN: v_addc_u32 +; GCN: v_addc_u32 +define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { + %foo = load i128, i128 addrspace(1)* %in, align 8 + %result = add i128 %foo, %a + store i128 %result, i128 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}sgpr_operand_reversed: +; GCN: v_add_i32 +; GCN: v_addc_u32 +; GCN: v_addc_u32 +; GCN: v_addc_u32 +define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { + %foo = load i128, i128 addrspace(1)* %in, align 8 + %result = add i128 %a, %foo + store i128 %result, i128 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_sreg: +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) { + %result = add i128 %a, %b + store i128 %result, i128 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() readnone