Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -1431,6 +1431,13 @@ [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; +// Copies the source value to the destination value, such that the source +// is computed as if the entire program were executed in WQM if any other +// program code executes in WQM. +def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] +>; + // Return true if at least one thread within the pixel quad passes true into // the function. def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -617,6 +617,7 @@ continue; case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { // If the destination register is a physical register there isn't really // much we can do to fix this. 
Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -5952,6 +5952,11 @@ return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), 0); } + case Intrinsic::amdgcn_softwqm: { + SDValue Src = Op.getOperand(1); + return SDValue(DAG.getMachineNode(AMDGPU::SOFT_WQM, DL, Src.getValueType(), Src), + 0); + } case Intrinsic::amdgcn_wwm: { SDValue Src = Op.getOperand(1); return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3631,6 +3631,7 @@ case AMDGPU::PHI: return AMDGPU::PHI; case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; + case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; case AMDGPU::WWM: return AMDGPU::WWM; case AMDGPU::S_MOV_B32: { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -5506,6 +5507,7 @@ switch (UseMI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: @@ -5623,6 +5625,7 @@ case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); if (RI.hasAGPRs(SrcRC)) { Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -111,6 +111,10 @@ // WQM pass processes it. def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is +// turned into a copy by WQM pass, but does not seed WQM requirements. 
+def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; + // Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so // that the @earlyclobber is respected. The @earlyclobber is to make sure that // the instruction that defines $src0 (which is run in WWM) doesn't Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -312,6 +312,7 @@ char GlobalFlags = 0; bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); SmallVector SetInactiveInstrs; + SmallVector SoftWQMInstrs; // We need to visit the basic blocks in reverse post-order so that we visit // defs before uses, in particular so that we don't accidentally mark an @@ -340,6 +341,10 @@ // correct, so we need it to be in WQM. Flags = StateWQM; LowerToCopyInstrs.push_back(&MI); + } else if (Opcode == AMDGPU::SOFT_WQM) { + LowerToCopyInstrs.push_back(&MI); + SoftWQMInstrs.push_back(&MI); + continue; } else if (Opcode == AMDGPU::WWM) { // The WWM intrinsic doesn't make the same guarantee, and plus it needs // to be executed in WQM or Exact so that its copy doesn't clobber @@ -407,9 +412,12 @@ // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is // ever used anywhere in the function. This implements the corresponding // semantics of @llvm.amdgcn.set.inactive. + // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm. if (GlobalFlags & StateWQM) { for (MachineInstr *MI : SetInactiveInstrs) markInstruction(*MI, StateWQM, Worklist); + for (MachineInstr *MI : SoftWQMInstrs) + markInstruction(*MI, StateWQM, Worklist); } return GlobalFlags; @@ -885,7 +893,7 @@ unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM)) + if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty()) return !LiveMaskQueries.empty(); } else { // Store a copy of the original live mask when required Index: test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -0,0 +1,188 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s + +; Check that WQM is not triggered by the softwqm intrinsic alone. +; +;CHECK-LABEL: {{^}}test1: +;CHECK-NOT: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) + ret float %out.0 +} + +; Check that the softwqm intrinsic works correctly for integers. +; +;CHECK-LABEL: {{^}}test2: +;CHECK-NOT: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %out = fadd float %src0, %src1 + %out.0 = bitcast float %out to i32 + %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0) + %out.2 = bitcast i32 %out.1 to float + ret float %out.2 +} + +; Make sure the transition from WQM to Exact to softwqm does not trigger WQM. 
+; +;CHECK-LABEL: {{^}}test_softwqm1: +;CHECK-NOT: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: buffer_store_dword +;CHECK-NOT; s_wqm_b64 exec, exec +;CHECK: v_add_f32_e32 +define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %temp = fadd float %src0, %src1 + call void @llvm.amdgcn.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %out = fadd float %temp, %temp + %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) + ret float %out.0 +} + +; Make sure the transition from WQM to Exact to softwqm does trigger WQM. +; +;CHECK-LABEL: {{^}}test_softwqm2: +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: buffer_store_dword +;CHECK; s_wqm_b64 exec, exec +;CHECK: v_add_f32_e32 +define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %temp = fadd float %src0, %src1 + %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp) + call void @llvm.amdgcn.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %out = fadd float %temp, %temp + %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) + ret float %out.0 +} + +; Make sure the transition from Exact to WWM then softwqm does not trigger WQM. 
+; +;CHECK-LABEL: {{^}}test_wwm1: +;CHECK: buffer_load_dword +;CHECK: buffer_store_dword +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK-NOT: s_wqm_b64 +define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %temp = fadd float %src0, %src1 + %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) + %out = fadd float %temp.0, %temp.0 + %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) + ret float %out.0 +} + +; Check that softwqm on one case of branch does not trigger WQM for shader. +; +;CHECK-LABEL: {{^}}test_control_flow_0: +;CHECK-NEXT: ; %main_body +;CHECK-NOT: s_wqm_b64 exec, exec +;CHECK: %ELSE +;CHECK: store +;CHECK: %IF +;CHECK: buffer_load +;CHECK: buffer_load +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ELSE + +IF: + %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %out = fadd float %src0, %src1 + %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) + br label %END + +ELSE: + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +; Check that softwqm on one case of branch is treated as WQM in WQM shader. 
+; +;CHECK-LABEL: {{^}}test_control_flow_1: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %ELSE +;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]] +;CHECK: store +;CHECK: s_mov_b64 exec, [[SAVED]] +;CHECK: %IF +;CHECK-NOT: s_and_saveexec_b64 +;CHECK-NOT: s_and_b64 exec +;CHECK: buffer_load +;CHECK: buffer_load +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) { +main_body: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %data.sample = extractelement <4 x float> %dtex, i32 0 + + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ELSE + +IF: + %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %out = fadd float %src0, %src1 + %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) + br label %END + +ELSE: + call void @llvm.amdgcn.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 +declare <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 +declare void @llvm.amdgcn.kill(i1) #1 +declare float @llvm.amdgcn.wqm.f32(float) #3 +declare float @llvm.amdgcn.softwqm.f32(float) #3 +declare i32 @llvm.amdgcn.softwqm.i32(i32) #3 +declare float @llvm.amdgcn.wwm.f32(float) #3 + +attributes #1 = { nounwind } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind readnone }