diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -4147,9 +4147,9 @@ * The vector memory operations access a vector L0 cache. There is a single L0 cache per CU. Each SIMD of a CU accesses the same L0 cache. Therefore, no special action is required for coherence between the lanes of a single - wavefront. However, a ``BUFFER_GL0_INV`` is required for coherence between + wavefront. However, a ``buffer_gl0_inv`` is required for coherence between wavefronts executing in the same work-group as they may be executing on SIMDs - of different CUs that access different L0s. A ``BUFFER_GL0_INV`` is also + of different CUs that access different L0s. A ``buffer_gl0_inv`` is also required for coherence between wavefronts executing in different work-groups as they may be executing on different WGPs. * The scalar memory operations access a scalar L0 cache shared by all wavefronts @@ -4158,7 +4158,7 @@ :ref:`amdgpu-address-spaces`. * The vector and scalar memory L0 caches use an L1 cache shared by all WGPs on the same SA. Therefore, no special action is required for coherence between - the wavefronts of a single work-group. However, a ``BUFFER_GL1_INV`` is + the wavefronts of a single work-group. However, a ``buffer_gl1_inv`` is required for coherence between wavefronts executing in different work-groups as they may be executing on different SAs that access different L1s. * The L1 caches have independent quadrants to service disjoint ranges of virtual @@ -4320,7 +4320,8 @@ load atomic monotonic - workgroup - global 1. buffer/global/flat_load 1. buffer/global/flat_load - generic glc=1 - - If CU wavefront execution mode, omit glc=1. + - If CU wavefront execution + mode, omit glc=1. load atomic monotonic - singlethread - local 1. ds_load 1. ds_load - wavefront @@ -4350,11 +4351,13 @@ - generic load atomic acquire - workgroup - global 1. buffer/global/flat_load 1. 
buffer/global_load glc=1 - - If CU wavefront execution mode, omit glc=1. + - If CU wavefront execution + mode, omit glc=1. 2. s_waitcnt vmcnt(0) - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Must happen before the following buffer_gl0_inv and before any following @@ -4365,7 +4368,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -4390,7 +4394,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - If OpenCL, omit. - Ensures that following @@ -4399,12 +4404,14 @@ load atomic acquire - workgroup - generic 1. flat_load 1. flat_load glc=1 - - If CU wavefront execution mode, omit glc=1. + - If CU wavefront execution + mode, omit glc=1. 2. s_waitcnt lgkmcnt(0) 2. s_waitcnt lgkmcnt(0) & vmcnt(0) - - If CU wavefront execution mode, omit vmcnt. + - If CU wavefront execution + mode, omit vmcnt(0). - If OpenCL, omit. - If OpenCL, omit lgkmcnt(0). - Must happen before - Must happen before @@ -4423,7 +4430,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -4487,10 +4495,11 @@ atomicrmw acquire - workgroup - global 1. buffer/global/flat_atomic 1. buffer/global_atomic 2. s_waitcnt vm/vscnt(0) - - If CU wavefront execution mode, omit. - - Use vmcnt if atomic with - return and vscnt if atomic - with no-return. + - If CU wavefront execution + mode, omit. + - Use vmcnt(0) if atomic with + return and vscnt(0) if + atomic with no-return. - Must happen before the following buffer_gl0_inv and before any following @@ -4501,7 +4510,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -4536,13 +4546,13 @@ 2. waitcnt lgkmcnt(0) 2. 
waitcnt lgkmcnt(0) & vm/vscnt(0) - - If CU wavefront execution mode, omit vm/vscnt. + - If CU wavefront execution + mode, omit vm/vscnt(0). - If OpenCL, omit. - If OpenCL, omit - waitcnt lgkmcnt(0).. - - Use vmcnt if atomic with - return and vscnt if atomic - with no-return. waitcnt lgkmcnt(0). + - Use vmcnt(0) if atomic with + return and vscnt(0) if + atomic with no-return. - Must happen before - Must happen before any following the following global/generic buffer_gl0_inv. @@ -4558,7 +4568,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -4567,9 +4578,9 @@ atomicrmw acquire - agent - global 1. buffer/global/flat_atomic 1. buffer/global_atomic - system 2. s_waitcnt vmcnt(0) 2. s_waitcnt vm/vscnt(0) - - Use vmcnt if atomic with - return and vscnt if atomic - with no-return. + - Use vmcnt(0) if atomic with + return and vscnt(0) if + atomic with no-return. waitcnt lgkmcnt(0). - Must happen before - Must happen before following following @@ -4599,9 +4610,9 @@ - If OpenCL, omit - If OpenCL, omit lgkmcnt(0). lgkmcnt(0). - - Use vmcnt if atomic with - return and vscnt if atomic - with no-return. + - Use vmcnt(0) if atomic with + return and vscnt(0) if + atomic with no-return. - Must happen before - Must happen before following following buffer_wbinvl1_vol. buffer_gl*_inv. @@ -4629,8 +4640,9 @@ fence acquire - workgroup *none* 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL and - If OpenCL and address space is address space is not generic, omit. not generic, omit @@ -4741,7 +4753,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -4897,8 +4910,9 @@ store atomic release - workgroup - global 1. 
s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL, omit. - If OpenCL, omit lgkmcnt(0). - Must happen after @@ -4947,10 +4961,11 @@ store that is being store that is being released. released. - 2. buffer/global/flat_store 2. buffer/global_store + 2. buffer/global/flat_store 2. buffer/global/flat_store store atomic release - workgroup - local 1. waitcnt vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - If OpenCL, omit. - Could be split into separate s_waitcnt @@ -4987,8 +5002,9 @@ store atomic release - workgroup - generic 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL, omit. - If OpenCL, omit lgkmcnt(0). - Must happen after @@ -5022,8 +5038,10 @@ - s_waitcnt lgkmcnt(0) must happen after any preceding - local/generic load/store/load - atomic/store atomic/atomicrmw. + local/generic + load/store/load + atomic/store + atomic/atomicrmw. - Must happen before - Must happen before the following the following store. store. @@ -5088,8 +5106,9 @@ atomicrmw release - workgroup - global 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL, omit. - Must happen after @@ -5141,7 +5160,8 @@ 2. buffer/global/flat_atomic 2. buffer/global_atomic atomicrmw release - workgroup - local 1. waitcnt vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - If OpenCL, omit. - Could be split into separate s_waitcnt @@ -5178,8 +5198,9 @@ atomicrmw release - workgroup - generic 1. s_waitcnt lgkmcnt(0) 1. 
s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL, omit. - If OpenCL, omit waitcnt lgkmcnt(0). - Must happen after @@ -5213,8 +5234,10 @@ - s_waitcnt lgkmcnt(0) must happen after any preceding - local/generic load/store/load - atomic/store atomic/atomicrmw. + local/generic + load/store/load + atomic/store + atomic/atomicrmw. - Must happen before - Must happen before the following the following atomicrmw. atomicrmw. @@ -5278,8 +5301,9 @@ fence release - workgroup *none* 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL and - If OpenCL and address space is address space is not generic, omit. not generic, omit @@ -5437,8 +5461,9 @@ atomicrmw acq_rel - workgroup - global 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL, omit. - If OpenCL, omit s_waitcnt lgkmcnt(0). - Must happen after - Must happen after @@ -5472,8 +5497,10 @@ - s_waitcnt lgkmcnt(0) must happen after any preceding - local/generic load/store/load - atomic/store atomic/atomicrmw. + local/generic + load/store/load + atomic/store + atomic/atomicrmw. - Must happen before - Must happen before the following the following atomicrmw. atomicrmw. @@ -5488,10 +5515,11 @@ 2. buffer/global/flat_atomic 2. buffer/global_atomic 3. s_waitcnt vm/vscnt(0) - - If CU wavefront execution mode, omit vm/vscnt. - - Use vmcnt if atomic with - return and vscnt if atomic - with no-return. + - If CU wavefront execution + mode, omit vm/vscnt(0). + - Use vmcnt(0) if atomic with + return and vscnt(0) if + atomic with no-return. waitcnt lgkmcnt(0). 
- Must happen before the following @@ -5505,7 +5533,8 @@ 4. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -5513,7 +5542,8 @@ atomicrmw acq_rel - workgroup - local 1. waitcnt vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - If OpenCL, omit. - Could be split into separate s_waitcnt @@ -5565,7 +5595,8 @@ 4. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - If OpenCL omit. - Ensures that following @@ -5575,8 +5606,9 @@ atomicrmw acq_rel - workgroup - generic 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL, omit. - If OpenCL, omit waitcnt lgkmcnt(0). - Must happen after @@ -5610,8 +5642,10 @@ - s_waitcnt lgkmcnt(0) must happen after any preceding - local/generic load/store/load - atomic/store atomic/atomicrmw. + local/generic + load/store/load + atomic/store + atomic/atomicrmw. - Must happen before - Must happen before the following the following atomicrmw. atomicrmw. @@ -5627,7 +5661,8 @@ 3. s_waitcnt lgkmcnt(0) 3. s_waitcnt lgkmcnt(0) & vm/vscnt(0) - - If CU wavefront execution mode, omit vm/vscnt. + - If CU wavefront execution + mode, omit vm/vscnt(0). - If OpenCL, omit. - If OpenCL, omit waitcnt lgkmcnt(0). - Must happen before - Must happen before @@ -5645,7 +5680,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -5699,9 +5735,9 @@ 2. buffer/global/flat_atomic 2. buffer/global_atomic 3. s_waitcnt vmcnt(0) 3. s_waitcnt vm/vscnt(0) - - Use vmcnt if atomic with - return and vscnt if atomic - with no-return. + - Use vmcnt(0) if atomic with + return and vscnt(0) if + atomic with no-return. 
waitcnt lgkmcnt(0). - Must happen before - Must happen before following following @@ -5776,9 +5812,9 @@ - If OpenCL, omit - If OpenCL, omit lgkmcnt(0). lgkmcnt(0). - - Use vmcnt if atomic with - return and vscnt if atomic - with no-return. + - Use vmcnt(0) if atomic with + return and vscnt(0) if + atomic with no-return. - Must happen before - Must happen before following following buffer_wbinvl1_vol. buffer_gl*_inv. @@ -5806,8 +5842,9 @@ fence acq_rel - workgroup *none* 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - If OpenCL and - If OpenCL and address space is address space is not generic, omit. not generic, omit @@ -5926,7 +5963,8 @@ 3. buffer_gl0_inv - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Ensures that following loads will not see @@ -6047,8 +6085,9 @@ load atomic seq_cst - workgroup - global 1. s_waitcnt lgkmcnt(0) 1. s_waitcnt lgkmcnt(0) & - generic vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit vmcnt and - vscnt. + - If CU wavefront execution + mode, omit vmcnt(0) and + vscnt(0). - Could be split into separate s_waitcnt vmcnt(0), s_waitcnt @@ -6134,8 +6173,17 @@ preventing a store preventing a store release followed by release followed by load acquire from load acquire from - competing out of competing out of - order.) order.) + completing out of completing out of + order. The waitcnt order. The waitcnt + could be placed after could be placed after + seq_store or before seq_store or before + the seq_load. We the seq_load. We + choose the load to choose the load to + make the waitcnt be make the waitcnt be + as late as possible as late as possible + so that the store so that the store + may have already may have already + completed.) completed.) 2. *Following 2. *Following instructions same as instructions same as @@ -6152,7 +6200,8 @@ 1. 
s_waitcnt vmcnt(0) & vscnt(0) - - If CU wavefront execution mode, omit. + - If CU wavefront execution + mode, omit. - Could be split into separate s_waitcnt vmcnt(0) and s_waitcnt @@ -6221,8 +6270,17 @@ preventing a store release followed by load acquire from - competing out of - order.) + completing out of + order. The waitcnt + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the waitcnt be + as late as possible + so that the store + may have already + completed.) 2. *Following instructions same as @@ -6320,8 +6378,17 @@ preventing a store preventing a store release followed by release followed by load acquire from load acquire from - competing out of competing out of - order.) order.) + completing out of completing out of + order. The waitcnt order. The waitcnt + could be placed after could be placed after + seq_store or before seq_store or before + the seq_load. We the seq_load. We + choose the load to choose the load to + make the waitcnt be make the waitcnt be + as late as possible as late as possible + so that the store so that the store + may have already may have already + completed.) completed.) 2. *Following 2. *Following instructions same as instructions same as