Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -740,6 +740,13 @@
   [IntrNoMem, IntrSpeculatable]
 >;
 
+
+// Copies the source value to the destination value, with the guarantee that
+// the source value is computed as if the entire program were executed in WQM.
+def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -338,6 +338,9 @@
                                       unsigned &SMovOp,
                                       int64_t &Imm) {
+  if (Copy->getOpcode() != AMDGPU::COPY)
+    return false;
+
   if (!MoveImm->isMoveImmediate())
     return false;
 
@@ -564,7 +567,8 @@
     switch (MI.getOpcode()) {
     default:
       continue;
-    case AMDGPU::COPY: {
+    case AMDGPU::COPY:
+    case AMDGPU::WQM: {
       // If the destination register is a physical register there isn't really
       // much we can do to fix this.
       if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3290,6 +3290,11 @@
                               Op.getOperand(1), Op.getOperand(2));
     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
   }
+  case Intrinsic::amdgcn_wqm: {
+    SDValue Src = Op.getOperand(1);
+    return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
+                   0);
+  }
   default:
     return Op;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2653,6 +2653,7 @@
   case AMDGPU::COPY: return AMDGPU::COPY;
   case AMDGPU::PHI: return AMDGPU::PHI;
   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
+  case AMDGPU::WQM: return AMDGPU::WQM;
   case AMDGPU::S_MOV_B32:
     return MI.getOperand(1).isReg() ?
            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -3957,6 +3958,7 @@
   case AMDGPU::PHI:
   case AMDGPU::REG_SEQUENCE:
   case AMDGPU::INSERT_SUBREG:
+  case AMDGPU::WQM:
     if (RI.hasVGPRs(NewDstRC))
       return nullptr;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -115,6 +115,11 @@
 // pass to enable folding of inline immediates.
 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
+
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
+def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
 let usesCustomInserter = 1, SALU = 1 in {
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -136,6 +136,7 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<MachineInstr *, 4> LiveMaskQueries;
+  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
 
   void printInfo();
 
@@ -162,6 +163,7 @@
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
   void lowerLiveMaskQueries(unsigned LiveMaskReg);
+  void lowerCopyInstrs();
 
 public:
   static char ID;
@@ -294,6 +296,11 @@
         markUsesWQM(MI, Worklist);
         GlobalFlags |= StateWQM;
         continue;
+      } else if (Opcode == AMDGPU::WQM) {
+        // The WQM intrinsic requires its output to have all the helper lanes
+        // correct, so we need it to be in WQM.
+        Flags = StateWQM;
+        LowerToCopyInstrs.push_back(&MI);
       } else if (TII->isDisableWQM(MI)) {
         Flags = StateExact;
       } else {
@@ -666,6 +673,11 @@
   }
 }
 
+void SIWholeQuadMode::lowerCopyInstrs() {
+  for (MachineInstr *MI : LowerToCopyInstrs)
+    MI->setDesc(TII->get(AMDGPU::COPY));
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
     return false;
@@ -673,6 +685,7 @@
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
+  LowerToCopyInstrs.clear();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
 
@@ -708,6 +721,7 @@
         .addReg(AMDGPU::EXEC);
 
       lowerLiveMaskQueries(LiveMaskReg);
+      lowerCopyInstrs();
       // EntryMI may become invalid here
       return true;
     }
@@ -716,6 +730,7 @@
   DEBUG(printInfo());
 
   lowerLiveMaskQueries(LiveMaskReg);
+  lowerCopyInstrs();
 
   // Handle the general case
   for (auto BII : Blocks)
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -74,6 +74,40 @@
   ret <4 x float> %dtex
 }
 
+; Check that WQM is triggered by the wqm intrinsic.
+;
+;CHECK-LABEL: {{^}}test5:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that the wqm intrinsic works correctly for integers.
+;
+;CHECK-LABEL: {{^}}test6:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = bitcast float %out to i32
+  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
+  %out.2 = bitcast i32 %out.1 to float
+  ret float %out.2
+}
+
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
 ; exact.
 ;
@@ -494,6 +528,8 @@
 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
 declare void @llvm.AMDGPU.kill(float) #1
+declare float @llvm.amdgcn.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.wqm.i32(i32) #3
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
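
; Illustrative usage sketch (hypothetical standalone shader, not part of the
; patch above; the function and value names are made up). It shows the intent
; of the new intrinsic: marking a value with @llvm.amdgcn.wqm asks the WQM
; pass to compute it as if the whole quad were executing, so helper lanes see
; a correct value too.
define amdgpu_ps float @wqm_usage_sketch(i32 inreg %idx) {
main_body:
  ; Because %v feeds @llvm.amdgcn.wqm below, the WQM pass keeps this load in
  ; whole quad mode, so helper lanes also hold the loaded value.
  %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %v.wqm = call float @llvm.amdgcn.wqm.f32(float %v)
  ret float %v.wqm
}

declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare float @llvm.amdgcn.wqm.f32(float) #1

attributes #0 = { nounwind readonly }
attributes #1 = { nounwind readnone }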