Index: lib/Target/AMDGPU/SIDefines.h =================================================================== --- lib/Target/AMDGPU/SIDefines.h +++ lib/Target/AMDGPU/SIDefines.h @@ -41,7 +41,8 @@ WQM = 1 << 22, VGPRSpill = 1 << 23, VOPAsmPrefer32Bit = 1 << 24, - Gather4 = 1 << 25 + Gather4 = 1 << 25, + DisableWQM = 1 << 26 }; } Index: lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- lib/Target/AMDGPU/SIInstrFormats.td +++ lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,8 @@ field bits<1> DS = 0; field bits<1> MIMG = 0; field bits<1> FLAT = 0; + + // Whether WQM _must_ be enabled for this instruction field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; @@ -50,6 +52,9 @@ field bits<1> Gather4 = 0; + // Whether WQM _must_ be disabled for this instruction. + field bits<1> DisableWQM = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -81,6 +86,7 @@ let TSFlags{23} = VGPRSpill; let TSFlags{24} = VOPAsmPrefer32Bit; let TSFlags{25} = Gather4; + let TSFlags{26} = DisableWQM; let SchedRW = [Write32Bit]; Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -340,6 +340,14 @@ return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isDisableWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM; + } + + bool isDisableWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DisableWQM; + } + static bool isVGPRSpill(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -2984,6 +2984,10 @@ def "" : MUBUF_Pseudo , MUBUFAddr64Table <0>; + let DisableWQM = 1 in { + def "_exact" : MUBUF_Pseudo ; + } + let addr64 = 0, isCodeGenOnly = 0 in { def _si : MUBUF_Real_si ; } @@ -3054,7 +3058,8 @@ multiclass MUBUF_Atomic { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1, + DisableWQM = 1 in { // No return variants let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { @@ -3458,6 +3463,7 @@ let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; } multiclass MIMG_Store_Addr_Helper op, string asm, @@ -3489,6 +3495,7 @@ let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; } Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -2210,7 +2210,7 @@ (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), - (!cast(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2218,7 +2218,7 @@ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), - (!cast(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset, + (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2227,7 +2227,7 @@ (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), - (!cast(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset, + (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2236,7 +2236,7 @@ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), - (!cast(opcode # _BOTHEN) + (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -185,7 +185,7 @@ if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; - } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { + } else if (TII->isDisableWQM(MI)) { Flags = StateExact; } else { // Handle export instructions with the exec mask valid flag set @@ -237,9 +237,10 @@ InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references BlockInfo &BI = Blocks[MBB]; - // Control flow-type instructions that are followed by WQM computations - // must themselves be in WQM. - if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) { + // Control flow-type instructions and stores to temporary memory that are + // followed by WQM computations must themselves be in WQM. + if ((II.OutNeeds & StateWQM) && !II.Needs && + (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- test/CodeGen/AMDGPU/skip-if-dead.ll +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -376,7 +376,7 @@ br i1 %tmp7, label %bb8, label %bb9 bb8: ; preds = %bb9, %bb4 - store volatile i32 9, i32 addrspace(1)* undef + call void @llvm.amdgcn.buffer.store.f32(float 9.0, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) ret void bb9: ; preds = %bb4 @@ -385,6 +385,7 @@ declare void @llvm.AMDGPU.kill(float) #0 declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind attributes #0 = { nounwind } attributes #1 = { nounwind readnone } \ No newline at end of file Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -41,14 +41,14 @@ ;CHECK: store ;CHECK-NOT: exec ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2 - %wr = extractelement <4 x float> %tex, i32 1 - store float %wr, float addrspace(1)* %gep + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0) + ret <4 x float> %tex } @@ -66,8 +66,9 @@ define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) { main_body: %c.1 = mul i32 %c, %d - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1 - store float %data, float addrspace(1)* %gep + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %tex } @@ -89,7 +90,7 @@ ;CHECK: s_mov_b64 exec, [[SAVED]] ;CHECK: %IF ;CHECK: image_sample -define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -100,8 +101,7 @@ br label %END ELSE: - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) br label %END END: @@ -129,7 +129,7 @@ ;CHECK: s_or_b64 exec, exec, ;CHECK: v_mov_b32_e32 v0 ;CHECK: ; return -define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %ELSE, label %IF @@ -140,8 +140,7 @@ br label %END ELSE: - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) br label %END END: @@ -163,23 +162,20 @@ ;CHECK: store ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmp -define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 0 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) ; The load that determines the branch (and should therefore be WQM) is ; surrounded by stores that require disabled WQM. %idx.2 = extractelement <3 x i32> %idx, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 - %z = load float, float addrspace(1)* %gep.2 + %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 %data.3 = extractelement <2 x float> %data, i32 1 - store float %data.3, float addrspace(1)* %gep.3 + call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -210,24 +206,21 @@ ;CHECK: load ;CHECK: store ;CHECK: v_cmp -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = extractelement <4 x float> %tex, i32 0 %idx.1 = extractelement <3 x i32> %idx, i32 0 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 0 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) %idx.2 = extractelement <3 x i32> %idx, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 - %z = load float, float addrspace(1)* %gep.2 + %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 %data.3 = extractelement <2 x float> %data, i32 1 - store float %data.3, float addrspace(1)* %gep.3 + call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -258,15 +251,14 @@ ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: %END ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END IF: - %data = load float, float addrspace(1)* %ptr - %gep = getelementptr float, float addrspace(1)* %ptr, i32 1 - store float %data, float addrspace(1)* %gep + %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) br label %END END: @@ -282,13 +274,11 @@ ;CHECK-NEXT: s_wqm_b64 exec, exec ;CHECK: image_sample ;CHECK: s_and_b64 exec, exec, [[ORIG]] -;SI: buffer_store_dword -;VI: flat_store_dword +;CHECK: buffer_store_dword ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmpx_ ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] -;SI: buffer_store_dword -;VI: flat_store_dword +;CHECK: buffer_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) { @@ -296,16 +286,14 @@ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %idx.0 = extractelement <2 x i32> %idx, i32 0 - %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0 %data.0 = extractelement <2 x float> %data, i32 0 - store float %data.0, float addrspace(1)* %gep.0 + call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) call void @llvm.AMDGPU.kill(float %z) %idx.1 = extractelement <2 x i32> %idx, i32 1 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 1 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %out = fadd <4 x float> %tex, %tex2 @@ -321,16 +309,14 @@ ; CHECK: s_wqm_b64 exec, exec ; CHECK: image_sample ; CHECK: s_and_b64 exec, exec, [[ORIG]] -; SI: buffer_store_dword -; VI: flat_store_dword +; CHECK: buffer_store_dword ; CHECK-NOT: wqm ; CHECK: v_cmpx_ -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) call void @llvm.AMDGPU.kill(float %z) @@ -388,9 +374,53 @@ ret <4 x float> %c.iv } +; Only intrinsic stores need exact execution -- other stores do not have +; externally visible effects and may require WQM for correctness. +; +; CHECK-LABEL: {{^}}test_alloca: +; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec + +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dword +; CHECK: s_wqm_b64 exec, exec +; CHECK: buffer_store_dword +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dword +; CHECK: s_wqm_b64 exec, exec +; CHECK: buffer_load_dword + +; CHECK: image_sample +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dwordx4 +define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { +entry: + %array = alloca [32 x i32], align 4 + + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + + %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0 + store i32 %a, i32* %s.gep, align 4 + + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) + + %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx + %c = load i32, i32* %c.gep, align 4 + + %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + + ret void +} + + declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3