Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1824,6 +1824,10 @@
    IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
+def int_amdgcn_convergent_copy : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrConvergent]
+>;
+
 // Given a value, copies it while setting all the inactive lanes to a given
 // value. Note that OpenGL helper lanes are considered active, so if the
 // program ever uses WQM, then the instruction and the first source will be
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7182,6 +7182,11 @@
     SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
                                             SIInstrInfo::MO_ABS32_LO);
     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
+  case Intrinsic::amdgcn_convergent_copy: {
+    SDValue Src = Op.getOperand(1);
+    return SDValue(DAG.getMachineNode(AMDGPU::CONVERGENT_COPY_PSEUDO, DL,
+                                      Src.getValueType(), Src), 0);
   }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2048,6 +2048,12 @@
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::CONVERGENT_COPY_PSEUDO: {
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+        .add(MI.getOperand(1));
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -235,6 +235,12 @@
   }
 } // End Defs = [SCC]
 
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+  def CONVERGENT_COPY_PSEUDO : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)> {
+    let isConvergent = 1;
+  }
+}
+
 let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
 def V_ADD_U64_PSEUDO : VPseudoInstSI <
   (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.convergent.copy.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.convergent.copy.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=amdgpu-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define i32 @convergent_copy_i32(i32 %val) {
+  ; GCN-LABEL: name: convergent_copy_i32
+  ; GCN: bb.0 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $vgpr0
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[CONVERG:%[0-9]+]]:vgpr_32 = CONVERGENT_COPY_PSEUDO [[COPY]], implicit $exec
+  ; GCN-NEXT:   $vgpr0 = COPY [[CONVERG]]
+  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  %ret = call i32 @llvm.amdgcn.convergent.copy.i32(i32 %val)
+  ret i32 %ret
+}
+
+define float @convergent_copy_f32(float %val) {
+  ; GCN-LABEL: name: convergent_copy_f32
+  ; GCN: bb.0 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $vgpr0
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[CONVERG:%[0-9]+]]:vgpr_32 = CONVERGENT_COPY_PSEUDO [[COPY]], implicit $exec
+  ; GCN-NEXT:   $vgpr0 = COPY [[CONVERG]]
+  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  %ret = call float @llvm.amdgcn.convergent.copy.f32(float %val)
+  ret float %ret
+}
+
+declare i32 @llvm.amdgcn.convergent.copy.i32(i32) #0
+declare float @llvm.amdgcn.convergent.copy.f32(float) #0
+attributes #0 = { nounwind readnone speculatable }