Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -129,6 +129,15 @@
   [IntrConvergent]>;
 
+// Given a value, copies it while setting all the inactive lanes to a given
+// value.
+def int_amdgcn_set_inactive :
+  Intrinsic<[llvm_anyint_ty],
+            [LLVMMatchType<0>, // value to be copied
+             LLVMMatchType<0>], // value for the inactive lanes to take
+            [IntrNoMem, IntrConvergent]>;
+
+
 //===----------------------------------------------------------------------===//
 // Instruction Intrinsics
 //===----------------------------------------------------------------------===//
 
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1088,6 +1088,28 @@
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::V_SET_INACTIVE_B32: {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+      .add(MI.getOperand(2));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::V_SET_INACTIVE_B64: {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
+                                 MI.getOperand(0).getReg())
+      .add(MI.getOperand(2));
+    expandPostRAPseudo(*Copy);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::V_MOVRELD_B32_V1:
   case AMDGPU::V_MOVRELD_B32_V2:
   case AMDGPU::V_MOVRELD_B32_V4:
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -117,6 +117,24 @@
   (ins VSrc_b64:$src0)>;
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
+// Invert the exec mask and overwrite the inactive lanes of dst with
+// $inactive, restoring exec after we're done. Used for implementing
+// wavefront reductions.
+def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32: $src, VSrc_b32:$inactive),
+  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
+  let Constraints = "$src = $vdst";
+  // Ensure that helper lanes also get set to the inactive value.
+  let DisableWQM = 1;
+}
+
+def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
+  (ins VReg_64: $src, VSrc_b64:$inactive),
+  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
+  let Constraints = "$src = $vdst";
+  // Ensure that helper lanes also get set to the inactive value.
+  let DisableWQM = 1;
+}
+
 let usesCustomInserter = 1, SALU = 1 in {
 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
Index: test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+
+; GCN-LABEL: {{^}}set_inactive:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 42
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) {
+  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+  store i32 %tmp, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}set_inactive_64:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
+  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+  store i64 %tmp, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+
+attributes #0 = { convergent readnone }
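
Usage sketch (illustrative, not part of the patch): a wavefront reduction first seeds the lanes that are inactive at the call site with the reduction operation's identity value, so that a later whole-wave scan reads a well-defined value from every lane. The kernel below is a hypothetical example of that pattern; the function name and the choice of 0 as the identity for an integer-add reduction are assumptions, not taken from this change.

  ; Copy %v; lanes that are inactive at this point read 0 (the iadd
  ; identity) instead of an undefined value.
  define amdgpu_kernel void @wave_add_seed(i32 addrspace(1)* %out, i32 %v) {
    %seeded = call i32 @llvm.amdgcn.set.inactive.i32(i32 %v, i32 0) #0
    store i32 %seeded, i32 addrspace(1)* %out
    ret void
  }

  declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0

  attributes #0 = { convergent readnone }

After V_SET_INACTIVE_B32 is expanded, the call lowers to the s_not_b64 exec / v_mov_b32 / s_not_b64 exec sequence checked in the test above.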