diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -396,9 +396,12 @@
 
 static ScheduleDAGInstrs *
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   ScheduleDAGMILive *DAG =
       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createIGroupLPDAGMutation());
   DAG->addMutation(createSchedBarrierDAGMutation());
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -408,9 +411,12 @@
 
 static ScheduleDAGInstrs *
 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   auto DAG = new GCNIterativeScheduler(C,
     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
 }
 
@@ -421,9 +427,12 @@
 
 static ScheduleDAGInstrs *
 createIterativeILPMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   auto DAG = new GCNIterativeScheduler(C,
     GCNIterativeScheduler::SCHEDULE_ILP);
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   return DAG;
 }
@@ -898,6 +907,8 @@
   ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
   DAG->addMutation(createIGroupLPDAGMutation());
   DAG->addMutation(createSchedBarrierDAGMutation());
@@ -1102,8 +1113,11 @@
 
 llvm::ScheduleDAGInstrs *
 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -3,6 +3,8 @@
 ; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX10 %s
 ; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX11 %s
+; RUN: FileCheck --enable-var-scope --check-prefixes=DBG,DBG11 %s < %t
 ; REQUIRES: asserts
 
 ; FIXME: Verifier error with xnack enabled.
@@ -22,6 +24,10 @@
 ; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
 ; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
 
+; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
+
 ; DBG-NOT: Cluster ld/st
 
 define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
@@ -93,6 +99,31 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
 ; GFX10-NEXT: flat_store_dword v[6:7], v11
 ; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_load_cluster_store:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:8
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16
+; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v3 offset:8
+; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
+; GFX11-NEXT: s_endpgm
 bb:
   %la0 = getelementptr inbounds i32, i32* %lb, i32 0
   %ld0 = load i32, i32* %la0
@@ -130,6 +161,10 @@
 ; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
 ; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
 
+; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
+
 ; DBG-NOT: Cluster ld/st
 
 define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
@@ -203,6 +238,33 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
 ; GFX10-NEXT: flat_store_dword v[6:7], v10
 ; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_load_valu_cluster_store:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:8
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16
+; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: flat_store_b32 v[0:1], v3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:8
+; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
+; GFX11-NEXT: s_endpgm
 bb:
   %la0 = getelementptr inbounds i32, i32* %lb, i32 0
   %ld0 = load i32, i32* %la0
@@ -266,6 +328,23 @@
 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
 ; GFX10-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_image_load:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 2, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 2, v1
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_load v[2:5], v[2:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: image_load v[6:9], v[6:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_endpgm
 entry:
   %x1 = add i32 %x, 1
   %y1 = add i32 %y, 1
@@ -309,6 +388,19 @@
 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
 ; GFX10-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: no_cluster_image_load:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_endpgm
 entry:
   %val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
   %val2 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src2, i32 0, i32 0)
@@ -377,6 +469,33 @@
 ; GFX10-NEXT: v_add_f32_e32 v2, v14, v18
 ; GFX10-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_image_sample:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_cvt_f32_i32_e32 v8, v0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_mov_b32_e32 v10, 1.0
+; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8
+; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9
+; GFX11-NEXT: v_mov_b32_e32 v5, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: v_mov_b32_e32 v7, v4
+; GFX11-NEXT: v_add_f32_e32 v8, 2.0, v8
+; GFX11-NEXT: v_add_f32_e32 v9, 2.0, v9
+; GFX11-NEXT: v_mov_b32_e32 v11, v10
+; GFX11-NEXT: v_mov_b32_e32 v12, v10
+; GFX11-NEXT: v_mov_b32_e32 v13, v10
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_endpgm
 entry:
   %s = sitofp i32 %x to float
   %t = sitofp i32 %y to float