Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -756,6 +756,16 @@
   [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// Given a value, copies it while setting all the inactive lanes to a given
+// value. Note that OpenGL helper lanes are considered active, so if the
+// program ever uses WQM, then the instruction and the first source will be
+// computed in WQM.
+def int_amdgcn_set_inactive :
+  Intrinsic<[llvm_anyint_ty],
+            [LLVMMatchType<0>,  // value to be copied
+             LLVMMatchType<0>], // value for the inactive lanes to take
+            [IntrNoMem, IntrConvergent]>;
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1099,6 +1099,28 @@
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::V_SET_INACTIVE_B32: {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+      .add(MI.getOperand(2));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::V_SET_INACTIVE_B64: {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
+                                 MI.getOperand(0).getReg())
+      .add(MI.getOperand(2));
+    expandPostRAPseudo(*Copy);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::V_MOVRELD_B32_V1:
   case AMDGPU::V_MOVRELD_B32_V2:
   case AMDGPU::V_MOVRELD_B32_V4:
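To illustrate the expansion above: a minimal IR sketch (the kernel name and the constant 7 are illustrative, not taken from the patch) together with the machine code the 32-bit pseudo is expected to produce.

define amdgpu_kernel void @set_inactive_sketch(i32 addrspace(1)* %out, i32 %in) {
  ; active lanes keep %in; inactive lanes receive the constant 7
  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 7)
  store i32 %tmp, i32 addrspace(1)* %out
  ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)

; Roughly expected expansion of the V_SET_INACTIVE_B32 pseudo:
;   s_not_b64 exec, exec    ; enable only the previously inactive lanes
;   v_mov_b32_e32 v0, 7     ; write the inactive-lane value
;   s_not_b64 exec, exec    ; restore the original exec mask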
Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -137,6 +137,20 @@
   let mayStore = 0;
 }
 
+// Invert the exec mask and overwrite the inactive lanes of dst with
+// $inactive, restoring the exec mask after we're done.
+def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32:$src, VSrc_b32:$inactive),
+  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
+  let Constraints = "$src = $vdst";
+}
+
+def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
+  (ins VReg_64:$src, VSrc_b64:$inactive),
+  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
+  let Constraints = "$src = $vdst";
+}
+
 let usesCustomInserter = 1, SALU = 1 in {
 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
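Because $src is tied to $vdst, the expansion only ever has to write the inactive lanes: the active lanes already hold the source value in the destination register. For the 64-bit variant, a sketch (function name illustrative) of IR that selects V_SET_INACTIVE_B64, whose V_MOV_B64_PSEUDO is then split into two 32-bit moves post-RA:

define amdgpu_kernel void @set_inactive_64_sketch(i64 addrspace(1)* %out, i64 %in) {
  ; both 32-bit halves of the inactive lanes are cleared to zero
  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
  store i64 %tmp, i64 addrspace(1)* %out
  ret void
}
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)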
Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -303,6 +303,7 @@
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
+  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -341,6 +342,23 @@
         GlobalFlags |= StateWWM;
         LowerToCopyInstrs.push_back(&MI);
         continue;
+      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
+                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+        III.Disabled = StateWWM;
+        MachineOperand &Inactive = MI.getOperand(2);
+        if (Inactive.isReg()) {
+          if (Inactive.isUndef()) {
+            LowerToCopyInstrs.push_back(&MI);
+          } else {
+            unsigned Reg = Inactive.getReg();
+            if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
+                markInstruction(DefMI, StateWWM, Worklist);
+            }
+          }
+        }
+        SetInactiveInstrs.push_back(&MI);
+        continue;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -380,6 +398,14 @@
     }
   }
 
+  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
+  // ever used anywhere in the function. This implements the corresponding
+  // semantics of @llvm.amdgcn.set.inactive.
+  if (GlobalFlags & StateWQM) {
+    for (MachineInstr *MI : SetInactiveInstrs)
+      markInstruction(*MI, StateWQM, Worklist);
+  }
+
   return GlobalFlags;
 }
 
@@ -799,8 +825,11 @@
 }
 
 void SIWholeQuadMode::lowerCopyInstrs() {
-  for (MachineInstr *MI : LowerToCopyInstrs)
+  for (MachineInstr *MI : LowerToCopyInstrs) {
+    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
+      MI->RemoveOperand(i);
     MI->setDesc(TII->get(AMDGPU::COPY));
+  }
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+
+; GCN-LABEL: {{^}}set_inactive:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 42
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) {
+  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+  store i32 %tmp, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}set_inactive_64:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
+  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+  store i64 %tmp, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+
+attributes #0 = { convergent readnone }
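One special case in SIWholeQuadMode above is worth spelling out: when the inactive value is undef there is nothing to write, so the pseudo is queued in LowerToCopyInstrs and later rewritten into a plain COPY (with the now-surplus operand removed in lowerCopyInstrs). A sketch (function name illustrative) of IR that takes this path:

define amdgpu_ps float @undef_inactive_sketch(float %v) {
main_body:
  %v.0 = bitcast float %v to i32
  ; inactive lanes may hold anything, so no exec-mask manipulation is needed
  %v.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %v.0, i32 undef)
  %v.2 = bitcast i32 %v.1 to float
  ret float %v.2
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)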
Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
@@ -256,6 +256,47 @@
   ret float %out.1
 }
 
+; Check that @llvm.amdgcn.set.inactive disables WWM.
+;
+;CHECK-LABEL: {{^}}test_set_inactive1:
+;CHECK: buffer_load_dword
+;CHECK: s_not_b64 exec, exec
+;CHECK: v_mov_b32_e32
+;CHECK: s_not_b64 exec, exec
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: v_add_i32_e32
+define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
+main_body:
+  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %src.0 = bitcast float %src to i32
+  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
+  %out = add i32 %src.1, %src.1
+  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  call void @llvm.amdgcn.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  ret void
+}
+
+; Check that enabling WQM anywhere enables WQM for the set.inactive source.
+;
+;CHECK-LABEL: {{^}}test_set_inactive2:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %src1.0 = bitcast float %src1 to i32
+  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src0.0 = bitcast float %src0 to i32
+  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
+  %out = add i32 %src0.1, %src1.1
+  %out.0 = bitcast i32 %out to float
+  call void @llvm.amdgcn.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  ret void
+}
+
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
 ; exact.
 ;
@@ -513,7 +554,7 @@
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: v_add_f32_e32 v0,
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
-define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
+define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
 main_body:
   %s = fadd float %a, %b
   ret float %s
@@ -680,10 +721,12 @@
 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
 declare float @llvm.amdgcn.wwm.f32(float) #3
 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
 attributes #3 = { nounwind readnone }
-attributes #4 = { "amdgpu-ps-wqm-outputs" }
+attributes #4 = { nounwind readnone convergent }
+attributes #5 = { "amdgpu-ps-wqm-outputs" }
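Putting the pieces together, the intended usage pattern (as exercised by test_set_inactive1 above) pairs @llvm.amdgcn.set.inactive with @llvm.amdgcn.wwm: inactive lanes are first forced to the operation's identity value so that a subsequent whole-wave computation reads well-defined data from them. A sketch under those assumptions (function name and the doubling stand-in for a real cross-lane reduction are illustrative):

define amdgpu_ps float @wwm_reduce_sketch(float %x) {
main_body:
  %x.0 = bitcast float %x to i32
  ; inactive lanes contribute the additive identity, 0
  %x.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x.0, i32 0)
  ; stand-in for a cross-lane reduction computed in whole-wave mode
  %sum = add i32 %x.1, %x.1
  %out = call i32 @llvm.amdgcn.wwm.i32(i32 %sum)
  %out.f = bitcast i32 %out to float
  ret float %out.f
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.wwm.i32(i32)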