Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -228,6 +228,21 @@ return true; } +/// Returns true if \p MI has an explicit register def operand that names a +/// physical register. +/// +/// NOTE(review): MachineInstr::defs() presumably visits explicit defs only, so +/// implicit physreg defs and regmask clobbers would not be reported here -- +/// confirm that this is sufficient for the callers. +static bool hasPhysRegDef(const MachineInstr &MI) { + for (const MachineOperand &Def : MI.defs()) { + if (Def.isReg() && + TargetRegisterInfo::isPhysicalRegister(Def.getReg())) + return true; + } + return false; +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? @@ -350,6 +365,13 @@ return false; } + if (hasPhysRegDef(*MBBI)) { + // We could re-order this instruction in theory, but it would require + // tracking physreg defs and uses. This should only affect M0 in + // practice. + return false; + } + if (MBBI->mayLoadOrStore() && (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { @@ -437,7 +459,8 @@ // down past this instruction. 
// check if we can move I across MBBI and if we can move all I's users if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) || + hasPhysRegDef(*MBBI)) break; } return false; Index: llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll @@ -613,6 +613,26 @@ ret void } +; The two LDS loads below are separated by a call; the checks expect them to +; stay as two ds_read_b32 instructions on either side of s_swappc_b64. +; GCN-LABEL: ds_read_call_read: +; GCN: ds_read_b32 +; GCN: s_swappc_b64 +; GCN: ds_read_b32 +define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) { + %x = call i32 @llvm.amdgcn.workitem.id.x() + %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x + %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1 + %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4 + call void @void_func_void() + %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4 + %r = add i32 %v0, %v1 + store i32 %r, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @void_func_void() #3 + declare i32 @llvm.amdgcn.workgroup.id.x() #1 declare i32 @llvm.amdgcn.workgroup.id.y() #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -623,3 +643,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } attributes #2 = { convergent nounwind } +attributes #3 = { nounwind noinline } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -160,21 +160,25 @@ ; SI won't merge ds memory operations, because of the signed offset bug, so ; we only have check lines for VI. 
-; VI-LABEL: v_interp_readnone: -; VI: s_mov_b32 m0, 0 -; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} -; VI: s_mov_b32 m0, -1{{$}} -; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 { -bb: - store float 0.000000e+00, float addrspace(3)* %lds - %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0) - %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 - store float 0.000000e+00, float addrspace(3)* %tmp2 - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 - ret void -} +; +; TODO: VI won't merge them either, because we are conservative about moving +; instructions past changes to physregs. +; +; TODO-VI-LABEL: v_interp_readnone: +; TODO-VI: s_mov_b32 m0, 0 +; TODO-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; TODO-VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} +; TODO-VI: s_mov_b32 m0, -1{{$}} +; TODO-VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +;define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 { +;bb: +; store float 0.000000e+00, float addrspace(3)* %lds +; %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0) +; %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 +; store float 0.000000e+00, float addrspace(3)* %tmp2 +; call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 +; ret void +;} ; Thest that v_interp_p1 uses different source and destination registers ; on 16 bank LDS chips. 
Index: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll +++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll @@ -232,6 +232,48 @@ ret void } +; GCN-LABEL: {{^}}smrd_imm_nomerge_m0: +; +; In principle we could merge the loads here as well, but it would require +; careful tracking of physical registers since both v_interp* and v_movrel* +; instructions (or gpr idx mode) use M0. +; +; GCN: s_buffer_load_dword +; GCN: s_buffer_load_dword +define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 { +main_body: + %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0) + %idx1 = bitcast float %idx1.f to i32 + + %v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim) + %v0.x = call nsz float @llvm.amdgcn.interp.p2(float %v0.x1, float %v, i32 0, i32 0, i32 %prim) + %v0.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 1, i32 %prim) + %v0.y = call nsz float @llvm.amdgcn.interp.p2(float %v0.y1, float %v, i32 0, i32 1, i32 %prim) + %v0.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 2, i32 %prim) + %v0.z = call nsz float @llvm.amdgcn.interp.p2(float %v0.z1, float %v, i32 0, i32 2, i32 %prim) + %v0.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0 + %v0.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1 + %v0 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2 + %a = extractelement <3 x float> %v0, i32 %idx1 + + %v1.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 0, i32 %prim) + %v1.x = call nsz float @llvm.amdgcn.interp.p2(float %v1.x1, float %v, i32 1, i32 0, i32 %prim) + %v1.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 1, i32 %prim) + %v1.y = call nsz float @llvm.amdgcn.interp.p2(float %v1.y1, float %v, i32 1, i32 1, i32 %prim) + %v1.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 2, i32 
%prim) + %v1.z = call nsz float @llvm.amdgcn.interp.p2(float %v1.z1, float %v, i32 1, i32 2, i32 %prim) + %v1.tmp0 = insertelement <3 x float> undef, float %v1.x, i32 0 + %v1.tmp1 = insertelement <3 x float> %v1.tmp0, float %v1.y, i32 1 + %v1 = insertelement <3 x float> %v1.tmp1, float %v1.z, i32 2 + + %b = extractelement <3 x float> %v1, i32 %idx1 + %c = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4) + + %res.tmp = fadd float %a, %b + %res = fadd float %res.tmp, %c + ret float %res +} + ; GCN-LABEL: {{^}}smrd_vgpr_merged: ; GCN-NEXT: %bb. ; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 @@ -289,8 +331,11 @@ declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } !0 = !{}