This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
clang/
-
lib/
-
CodeGen/
-
CGOpenMPRuntime.h
-
CGOpenMPRuntime.cpp
1
CGOpenMPRuntimeGPU.h
2/6
CGOpenMPRuntimeGPU.cpp
-
CGStmtOpenMP.cpp
1
CodeGenFunction.h
-
Sema/
-
SemaOpenMP.cpp
-
test/
-
AST/
-
ast-dump-openmp-distribute-parallel-for-simd.c
-
ast-dump-openmp-distribute-parallel-for.c
-
ast-dump-openmp-target-teams-distribute-parallel-for-simd.c
-
ast-dump-openmp-target-teams-distribute-parallel-for.c
-
ast-dump-openmp-teams-distribute-parallel-for-simd.c
-
ast-dump-openmp-teams-distribute-parallel-for.c
-
CodeGen/PowerPC/
-
PowerPC/
-
ppc64le-varargs-f128.c
-
OpenMP/
-
amdgpu_target_with_aligned_attribute.c
-
bug54082.c
-
bug60602.cpp
-
cancel_codegen.cpp
-
cancellation_point_codegen.cpp
-
debug-info-complex-byval.cpp
-
debug-info-openmp-array.cpp
-
debug_threadprivate_copyin.c
-
declare_target_codegen_globalization.cpp
-
declare_target_constexpr_codegen.cpp
1
declare_variant_construct_codegen_1.c
-
distribute_codegen.cpp
-
distribute_firstprivate_codegen.cpp
-
distribute_lastprivate_codegen.cpp
-
distribute_parallel_for_codegen.cpp
-
distribute_parallel_for_firstprivate_codegen.cpp
-
distribute_parallel_for_if_codegen.cpp
-
distribute_parallel_for_lastprivate_codegen.cpp
-
distribute_parallel_for_num_threads_codegen.cpp
-
distribute_parallel_for_private_codegen.cpp
-
distribute_parallel_for_proc_bind_codegen.cpp
-
distribute_parallel_for_reduction_task_codegen.cpp
-
distribute_parallel_for_simd_codegen.cpp
-
distribute_parallel_for_simd_firstprivate_codegen.cpp
-
distribute_parallel_for_simd_if_codegen.cpp
-
distribute_parallel_for_simd_lastprivate_codegen.cpp
-
distribute_parallel_for_simd_num_threads_codegen.cpp
-
distribute_parallel_for_simd_private_codegen.cpp
-
distribute_parallel_for_simd_proc_bind_codegen.cpp
-
distribute_private_codegen.cpp
-
distribute_simd_codegen.cpp
-
distribute_simd_firstprivate_codegen.cpp
-
distribute_simd_lastprivate_codegen.cpp
-
distribute_simd_private_codegen.cpp
-
distribute_simd_reduction_codegen.cpp
-
for_firstprivate_codegen.cpp
-
for_lastprivate_codegen.cpp
-
for_linear_codegen.cpp
-
for_private_codegen.cpp
-
for_reduction_codegen.cpp
-
for_reduction_codegen_UDR.cpp
-
for_reduction_task_codegen.cpp
-
irbuilder_safelen.cpp
-
irbuilder_safelen_order_concurrent.cpp
-
irbuilder_simd_aligned.cpp
-
irbuilder_simdlen.cpp
-
irbuilder_simdlen_safelen.cpp
-
master_taskloop_in_reduction_codegen.cpp
-
master_taskloop_simd_in_reduction_codegen.cpp
-
metadirective_device_kind_codegen.c
-
metadirective_device_kind_codegen.cpp
-
metadirective_implementation_codegen.cpp
-
nested_loop_codegen.cpp
-
nvptx_SPMD_codegen.cpp
-
nvptx_allocate_codegen.cpp
-
nvptx_data_sharing.cpp
-
nvptx_distribute_parallel_generic_mode_codegen.cpp
-
nvptx_lambda_capturing.cpp
-
nvptx_lambda_pointer_capturing.cpp
-
nvptx_multi_target_parallel_codegen.cpp
-
nvptx_nested_parallel_codegen.cpp
-
nvptx_parallel_codegen.cpp
-
nvptx_parallel_for_codegen.cpp
-
nvptx_target_codegen.cpp
-
nvptx_target_parallel_codegen.cpp
-
nvptx_target_parallel_num_threads_codegen.cpp
-
nvptx_target_parallel_proc_bind_codegen.cpp
-
nvptx_target_parallel_reduction_codegen.cpp
-
nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
-
nvptx_target_teams_codegen.cpp
-
nvptx_target_teams_distribute_codegen.cpp
-
nvptx_target_teams_distribute_parallel_for_codegen.cpp
-
nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
-
nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
-
nvptx_target_teams_distribute_simd_codegen.cpp
-
nvptx_teams_codegen.cpp
-
nvptx_teams_reduction_codegen.cpp
-
openmp_win_codegen.cpp
-
outlined_artificial.c
-
parallel_codegen.cpp
-
parallel_copyin_codegen.cpp
-
parallel_copyin_combined_codegen.c
-
parallel_firstprivate_codegen.cpp
-
parallel_for_codegen.cpp
-
parallel_for_lastprivate_conditional.cpp
-
parallel_for_linear_codegen.cpp
-
parallel_for_reduction_task_codegen.cpp
-
parallel_for_simd_aligned_codegen.cpp
-
parallel_if_codegen.cpp
-
parallel_if_codegen_PR51349.cpp
-
parallel_masked.cpp
-
parallel_masked_target.cpp
-
parallel_master_codegen.cpp
-
parallel_master_reduction_task_codegen.cpp
-
parallel_master_taskloop_codegen.cpp
-
parallel_master_taskloop_firstprivate_codegen.cpp
-
parallel_master_taskloop_lastprivate_codegen.cpp
-
parallel_master_taskloop_simd_codegen.cpp
-
parallel_master_taskloop_simd_firstprivate_codegen.cpp
-
parallel_master_taskloop_simd_lastprivate_codegen.cpp
-
parallel_private_codegen.cpp
-
parallel_reduction_codegen.cpp
-
parallel_reduction_task_codegen.cpp
-
parallel_sections_codegen.cpp
-
parallel_sections_reduction_task_codegen.cpp
-
reduction_compound_op.cpp
-
reduction_implicit_map.cpp
-
remarks_parallel_in_multiple_target_state_machines.c
-
remarks_parallel_in_target_state_machine.c
-
sections_firstprivate_codegen.cpp
-
sections_lastprivate_codegen.cpp
-
sections_private_codegen.cpp
-
sections_reduction_codegen.cpp
-
sections_reduction_task_codegen.cpp
-
single_codegen.cpp
-
single_firstprivate_codegen.cpp
-
single_private_codegen.cpp
-
target_codegen_global_capture.cpp
-
target_data_map_codegen_hold.cpp
-
target_in_reduction_codegen.cpp
-
target_map_codegen_03.cpp
-
target_map_codegen_hold.cpp
-
target_map_member_expr_codegen.cpp
-
target_ompx_dyn_cgroup_mem_codegen.cpp
-
target_parallel_codegen.cpp
-
target_parallel_debug_codegen.cpp
-
target_parallel_for_codegen.cpp
-
target_parallel_for_debug_codegen.cpp
-
target_parallel_for_reduction_task_codegen.cpp
-
target_parallel_for_simd_codegen.cpp
-
target_parallel_if_codegen.cpp
-
target_parallel_num_threads_codegen.cpp
-
target_parallel_reduction_task_codegen.cpp
-
target_teams_codegen.cpp
-
target_teams_distribute_codegen.cpp
-
target_teams_distribute_collapse_codegen.cpp
-
target_teams_distribute_dist_schedule_codegen.cpp
-
target_teams_distribute_firstprivate_codegen.cpp
-
target_teams_distribute_lastprivate_codegen.cpp
-
target_teams_distribute_parallel_for_codegen.cpp
-
target_teams_distribute_parallel_for_collapse_codegen.cpp
-
target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
-
target_teams_distribute_parallel_for_firstprivate_codegen.cpp
-
target_teams_distribute_parallel_for_if_codegen.cpp
-
target_teams_distribute_parallel_for_lastprivate_codegen.cpp
-
target_teams_distribute_parallel_for_order_codegen.cpp
-
target_teams_distribute_parallel_for_private_codegen.cpp
-
target_teams_distribute_parallel_for_proc_bind_codegen.cpp
-
target_teams_distribute_parallel_for_reduction_codegen.cpp
-
target_teams_distribute_parallel_for_reduction_task_codegen.cpp
-
target_teams_distribute_parallel_for_schedule_codegen.cpp
-
target_teams_distribute_parallel_for_simd_codegen.cpp
-
target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
-
target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
-
target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
-
target_teams_distribute_parallel_for_simd_if_codegen.cpp
-
target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
-
target_teams_distribute_parallel_for_simd_private_codegen.cpp
-
target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
-
target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
-
target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
-
target_teams_distribute_private_codegen.cpp
-
target_teams_distribute_reduction_codegen.cpp
-
target_teams_distribute_simd_codegen.cpp
-
target_teams_distribute_simd_collapse_codegen.cpp
-
target_teams_distribute_simd_dist_schedule_codegen.cpp
-
target_teams_distribute_simd_firstprivate_codegen.cpp
-
target_teams_distribute_simd_lastprivate_codegen.cpp
-
target_teams_distribute_simd_private_codegen.cpp
-
target_teams_distribute_simd_reduction_codegen.cpp
-
target_teams_map_codegen.cpp
-
target_teams_num_teams_codegen.cpp
-
target_teams_thread_limit_codegen.cpp
-
task_codegen.cpp
-
task_if_codegen.cpp
-
task_in_reduction_codegen.cpp
-
taskgroup_codegen.cpp
-
taskloop_in_reduction_codegen.cpp
-
taskloop_simd_in_reduction_codegen.cpp
-
teams_codegen.cpp
-
teams_distribute_codegen.cpp
-
teams_distribute_collapse_codegen.cpp
-
teams_distribute_dist_schedule_codegen.cpp
-
teams_distribute_firstprivate_codegen.cpp
-
teams_distribute_lastprivate_codegen.cpp
-
teams_distribute_parallel_for_codegen.cpp
-
teams_distribute_parallel_for_collapse_codegen.cpp
-
teams_distribute_parallel_for_copyin_codegen.cpp
-
teams_distribute_parallel_for_dist_schedule_codegen.cpp
-
teams_distribute_parallel_for_firstprivate_codegen.cpp
-
teams_distribute_parallel_for_if_codegen.cpp
-
teams_distribute_parallel_for_lastprivate_codegen.cpp
-
teams_distribute_parallel_for_num_threads_codegen.cpp
-
teams_distribute_parallel_for_private_codegen.cpp
-
teams_distribute_parallel_for_proc_bind_codegen.cpp
-
teams_distribute_parallel_for_reduction_codegen.cpp
-
teams_distribute_parallel_for_reduction_task_codegen.cpp
-
teams_distribute_parallel_for_schedule_codegen.cpp
-
teams_distribute_parallel_for_simd_codegen.cpp
-
teams_distribute_parallel_for_simd_collapse_codegen.cpp
-
teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
-
teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
-
teams_distribute_parallel_for_simd_if_codegen.cpp
-
teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
-
teams_distribute_parallel_for_simd_num_threads_codegen.cpp
-
teams_distribute_parallel_for_simd_private_codegen.cpp
-
teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
-
teams_distribute_parallel_for_simd_reduction_codegen.cpp
-
teams_distribute_parallel_for_simd_schedule_codegen.cpp
-
teams_distribute_private_codegen.cpp
-
teams_distribute_reduction_codegen.cpp
-
teams_distribute_simd_codegen.cpp
-
teams_distribute_simd_collapse_codegen.cpp
-
teams_distribute_simd_dist_schedule_codegen.cpp
-
teams_distribute_simd_firstprivate_codegen.cpp
-
teams_distribute_simd_lastprivate_codegen.cpp
-
teams_distribute_simd_private_codegen.cpp
-
teams_distribute_simd_reduction_codegen.cpp
-
teams_firstprivate_codegen.cpp
-
teams_private_codegen.cpp
-
tile_codegen.cpp
-
unroll_codegen_parallel_for_factor.cpp
-
vla_crash.c
-
utils/update_cc_test_checks/Inputs/
-
update_cc_test_checks/
-
Inputs/
-
generated-funcs.c.generated.expected
-
generated-funcs.c.no-generated.expected
-
llvm/
-
include/llvm/Frontend/OpenMP/
-
llvm/
-
Frontend/
-
OpenMP/
2
OMPKinds.def
-
lib/Transforms/IPO/
-
Transforms/
-
IPO/
3
OpenMPOpt.cpp
-
openmp/libomptarget/
-
libomptarget/
-
DeviceRTL/
-
include/
-
Interface.h
-
generated_microtask_cases.gen
-
src/
-
Parallelism.cpp
-
State.cpp
-
utils/
-
generate_microtask_cases.py

Differential D102107

[OpenMP] Codegen aggregate for outlined function captures
AcceptedPublic

Authored by ggeorgakoudis on May 8 2021, 8:16 AM.

Download Raw Diff

Details

Reviewers

jdoerfert
jhuber6
ABataev

Commits

rG7539e9cf811e: [OpenMP] Codegen aggregate for outlined function captures
rG1d66649adf28: [OpenMP] Codegen aggregate for outlined function captures
rGe9c7291cb25f: [OpenMP] Codegen aggregate for outlined function captures

Summary

Parallel regions are outlined as functions with capture variables explicitly generated as distinct parameters in the function's argument list. That complicates the fork_call interface in the OpenMP runtime: (1) the fork_call is variadic since there is a variable number of arguments to forward to the outlined function, (2) wrapping/unwrapping arguments happens in the OpenMP runtime, which is sub-optimal, has been a source of ABI bugs, and has a hardcoded limit (16) in the number of arguments, (3) forwarded arguments must cast to pointer types, which complicates debugging. This patch avoids those issues by aggregating captured arguments in a struct to pass to the fork_call.

Additional changes by Dhruva Chakrabarti <Dhruva.Chakrabarti@amd.com>
- Fixed opaque pointer miscompile.
- Added alloc_aggregate_arg entry point to OpenMPOpt SPMD list.
- Fixed nocapture attribute of kmpc_alloc_aggregate_arg.
- Added align attribute for call to kmpc_alloc_shared.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

There are a very large number of changes, so older changes are hidden. Show Older Changes

@ronlieb bisected amdgpu crashing to this too, rocm 'veccopy' case tries to dereference 0. Might be the same failure mode as the above or a different one, the hsa error reporting is quite coarse grained.

Suggest we pull this and try to fix it up before reapplying

edit: I haven't looked through the patch in detail, but it seems plausible that we could diff IR before and after for the failing cases to narrow down the fix. Test update looks machine generated, was it a script that could be repeated after adjusting codegen?

ggeorgakoudis added a reverting change: rGfb0cf0179526: Revert "[OpenMP] Codegen aggregate for outlined function captures".Jul 19 2021, 7:55 AM

Thanks for spotting. Tests are auto-generated through update scripts so it should be easy to compare. I'll fix and ping.

ggeorgakoudis reopened this revision.Jul 27 2021, 11:04 AM

This revision is now accepted and ready to land.Jul 27 2021, 11:04 AM

jhuber6 added inline comments.Jul 27 2021, 11:05 AM

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1268	This needs to include the size of the accompanying push after D106496.

Rebase, update tests

Harbormaster completed remote builds in B116635: Diff 362305.Jul 28 2021, 8:10 AM

jdoerfert added a child revision: D109165: [OpenMP] Add callback annotation to `__kmpc_parallel_51`.Sep 2 2021, 9:22 AM

Add previous, upper bound to aggregate for combined distributed directives.
Update tests.

Remove unnecessary comments.

ggeorgakoudis requested review of this revision.Sep 16 2021, 4:57 PM

Harbormaster completed remote builds in B124311: Diff 373107.Sep 16 2021, 6:39 PM

LGTM, this passed the usual tests and I didn't see any leftover allocations.

This revision is now accepted and ready to land.Sep 20 2021, 5:49 AM

ggeorgakoudis mentioned this in D110114: [OMPIRBuilder] Generate aggregate argument for parallel region outlined functions.Sep 20 2021, 4:50 PM

Fix for clang-tidy

Harbormaster completed remote builds in B124792: Diff 373755.Sep 20 2021, 5:16 PM

Rebase and update tests

Harbormaster completed remote builds in B124953: Diff 373982.Sep 21 2021, 10:35 AM

This revision was landed with ongoing or failed builds.Sep 21 2021, 10:51 AM

Closed by commit rG1d66649adf28: [OpenMP] Codegen aggregate for outlined function captures (authored by ggeorgakoudis). · Explain Why

This revision was automatically updated to reflect the committed changes.

ggeorgakoudis added a commit: rG1d66649adf28: [OpenMP] Codegen aggregate for outlined function captures.

seeing buildbot failures after this patch landed https://lab.llvm.org/staging/#/builders/183/builds/1598

In D102107#3013233, @ronlieb wrote:

seeing buildbot failures after this patch landed https://lab.llvm.org/staging/#/builders/183/builds/1598

Looking at it @ronlieb, thanks for reporting

In D102107#3013233, @ronlieb wrote:

seeing buildbot failures after this patch landed https://lab.llvm.org/staging/#/builders/183/builds/1598

This looks like another AMDGPU issue. The code in question doesn't do anything AMDGPU specific.
@ronlieb @JonChesterfield How to debug this?

Please revert the patch so our buildbot can resume greenness, and we can look into it with urgency today (me or Jon)
as it should be reproducible

@ronlieb can you apply this to amd-stg-open? If it breaks there we have a chance of trying a debugger on it. @dpalermo might be available again now.

@jdoerfert I debug stuff like this by inspection, guesswork and a DIY printf implementation that is itself not totally robust. Very occasionally the thing can be isolated as a unit test. If we're lucky a debug llvm + debug rocr build will be more verbose about what is going wrong.

In D102107#3013437, @ronlieb wrote:

Please revert the patch so our buildbot can resume greenness, and we can look into it with urgency today (me or Jon)
as it should be reproducible

Sounds good. @ggeorgakoudis let's revert and wait for input.

ggeorgakoudis added a reverting change: rGac90dfc43a01: Revert "[OpenMP] Codegen aggregate for outlined function captures".Sep 21 2021, 1:22 PM

@pdhaliwal
i will pass the problem over to Pushpinder Singh who should be waking up soon.

George, thank you for reverting it. i can reproduce the issue on a local system.
building latest (revert present) passes.
Revert the revert and fails

one test that fails :
in build directory:
export LOC=`pwd`; cd $LOC/runtimes/runtimes-bins/openmp && /usr/bin/python3.8 $LOC/./bin/llvm-lit -vv --show-unsupported --show-xfail -j 32 $LOC/runtimes/runtimes-bins/openmp/libomptarget/test/amdgcn-amd-amdhsa/mapping/declare_mapper_target_data.cpp

command stderr:

[GPU Memory Error] Addr: 0x0 Reason: Page not present or supervisor privilege.
Memory access fault by GPU node-2 (Agent handle: 0x18ae1d0) on address (nil). Reason: Page not present or supervisor privilege.

It looks like from IR diff that this patch is adding use of kmpc_alloc_shared method. These methods likely won't work on AMDGPU as device malloc is not available. Not sure what could be done apart from marking those tests as XFAIL on amdgcn. :(

In D102107#3014599, @pdhaliwal wrote:

It looks like from IR diff that this patch is adding use of kmpc_alloc_shared method. These methods likely won't work on AMDGPU as device malloc is not available. Not sure what could be done apart from marking those tests as XFAIL on amdgcn. :(

That's a good theory. Could confirm by patching the amdgpu malloc to return 0xdeadbeef or similar instead of 0 and seeing if that number shows up in the invalid memory access error. If so there's two problems:
1/ malloc on the gpu can fail, so it would mean we're missing a check on the return code of malloc in the devicertl
2/ increased importance for getting malloc running on amdgpu
The openmp in rocm/aomp does have a malloc, so it would also be interesting to see if they run OK with this patch applied

I got this after changing __kmpc_impl_malloc to return 0xdeadbeef. So, this confirms that missing malloc implementation is the root cause.

Memory access fault by GPU node-4 (Agent handle: 0x1bc5000) on address 0xdeadb000. Reason: Page not present or supervisor privilege.

In D102107#3014743, @pdhaliwal wrote:

I got this after changing __kmpc_impl_malloc to return 0xdeadbeef. So, this confirms that missing malloc implementation is the root cause.

Memory access fault by GPU node-4 (Agent handle: 0x1bc5000) on address 0xdeadb000. Reason: Page not present or supervisor privilege.

Nice! In that case I think the way to go is to audit the (probably few) places where kmpc_impl_malloc are called and add a check for whether the return value is 0. With that in place we can reland this and get more graceful failure (at a guess we should fall back to the host when gpu memory is exhausted? or maybe just print a 'out of gpu heap memory' style message and abort, don't know).

In D102107#3014759, @JonChesterfield wrote:

In D102107#3014743, @pdhaliwal wrote:

I got this after changing __kmpc_impl_malloc to return 0xdeadbeef. So, this confirms that missing malloc implementation is the root cause.

Memory access fault by GPU node-4 (Agent handle: 0x1bc5000) on address 0xdeadb000. Reason: Page not present or supervisor privilege.

Nice! In that case I think the way to go is to audit the (probably few) places where kmpc_impl_malloc are called and add a check for whether the return value is 0. With that in place we can reland this and get more graceful failure (at a guess we should fall back to the host when gpu memory is exhausted? or maybe just print a 'out of gpu heap memory' style message and abort, don't know).

We should only fail to remove the kmpc_shared_alloc with O0. Since we need kmpc_shared_alloc for all non-trivial codes, they would always fail on AMDGPU. That said,
why is the shared memory stack not catching this? It's a 64 byte stack for the main thread and we are looking at a 24 byte allocation for declare_mapper_target.cpp.
Can you determine why first two conditionals in __kmpc_alloc_shared don't catch this and return proper memory?

atmnpatel added a subscriber: atmnpatel.Sep 22 2021, 7:42 AM

ggeorgakoudis reopened this revision.Sep 28 2021, 10:01 AM

This revision is now accepted and ready to land.Sep 28 2021, 10:01 AM

Update memory allocation for aggregate argument.
Introduce runtime interface to allocate from local memory,
when in SPMD mode, or heap, when in generic.

Herald added a project: Restricted Project. · View Herald TranscriptSep 28 2021, 10:25 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B126137: Diff 375634.Sep 28 2021, 10:25 AM

@pdhaliwal @JonChesterfield @ronlieb I updated the aggregate argument memory allocation to use an alloca instead of malloc'ing in SPMD mode, which should resolve your issue. Could someone please test the updated patch and give me feedback before landing?

[AMD Official Use Only]

Hi George,
I will do it now ...

i backed up to your reverted patch, and applied this one.
I see some new errors

libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target_data.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target_data_enter_exit.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target_update.cpp
libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_offloading_map.cpp
libomptarget :: amdgcn-amd-amdhsa :: offloading/taskloop_offload_nowait.cpp

Dont know if Jon is around, so i will ask Singh @pdhaliwal if he can dig into it a bit more.

In D102107#3028386, @ronlieb wrote:
i backed up to your reverted patch, and applied this one.
I see some new errors
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target_data.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target_data_enter_exit.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_target_update.cpp
libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_offloading_map.cpp
libomptarget :: amdgcn-amd-amdhsa :: offloading/taskloop_offload_nowait.cpp
Dont know if Jon is around, so i will ask Singh @pdhaliwal if he can dig into it a bit more.

Hmm, thanks @ronlieb, @pdhaliwal please let me know what fails.

Apologies for late reply. Most of the tests now do not try to call malloc, so no page fault errors. But all of them are producing wrong results. For e.g. declare_mapper_target.cpp produces Sum = 132608 with the patch applied. Similarly for other tests as well. So don't know what's happening yet.

In D102107#3029825, @pdhaliwal wrote:

Apologies for late reply. Most of the tests now do not try to call malloc, so no page fault errors. But all of them are producing wrong results. For e.g. declare_mapper_target.cpp produces Sum = 132608 with the patch applied. Similarly for other tests as well. So don't know what's happening yet.

Thanks @pdhaliwal. I get the right result on nvidia. Please let me know when you get to the bottom of it.

I modified the declare_mapper_target to print the contents of array after target region and found the following output:

2 3 4 5 6 7 8 9 10 11 Sum = 65

Program:

#include <cstdio>
#include <cstdlib>

#define NUM 10

int main() {
  int *c= new int[NUM];
  for (int i = 0; i < NUM; i++) {
    c[i] = 1;
  }
#pragma omp target teams distribute  parallel for map(tofrom: c[0:NUM])
  for (int i = 0; i < NUM; i++) {
    c[i]++;
  }
  int sum = 0;
  for (int i = 0; i < NUM; i++) {
    sum += c[i];
    printf("%d ", c[i]);
  }
  // CHECK: Sum = 2048
  printf("Sum = %d\n", sum);
  return 0;
}

A different variant of the same program produces the correct output:

#include <cstdio>
#include <cstdlib>

#define NUM 10

int main() {
  int *c= new int[NUM];
  for (int i = 0; i < NUM; i++) {
    c[i] = 1;
  }

  int *b = new int[NUM];
#pragma omp target teams distribute  parallel for map(tofrom: c[0:NUM], b[0:NUM])
  for (int i = 0; i < NUM; i++) {
    b[i] = c[i] + 1;
  }
  int sum = 0;
  for (int i = 0; i < NUM; i++) {
    sum += b[i];
    printf("%d ", b[i]);
  }
  // CHECK: Sum = 2048
  printf("Sum = %d\n", sum);
  return 0;
}

Output (this is the right answer):

2 2 2 2 2 2 2 2 2 2 Sum = 20

On internal amd-stg-open branch, this patch works fine, so issue is only with the trunk.
I compared the generated IR before and after applying this patch, I didn't see anything suspicious. (but can't be 100% sure).

pdhaliwal added a child revision: D111218: [AMDGPU][OpenMP] Remove optnone from outlined functions.Oct 6 2021, 3:22 AM

I have created a patch (D111218) with fix for amdgcn. This is a temporary fix. I will still keep on looking into it until I find a real root cause.

JonChesterfield mentioned this in D111218: [AMDGPU][OpenMP] Remove optnone from outlined functions.Oct 6 2021, 3:34 AM

Update the interface for allocating/sharing the struct aggregate
Simplify invoking tasks

ggeorgakoudis requested review of this revision.Nov 9 2021, 7:44 AM

Harbormaster completed remote builds in B133254: Diff 385818.Nov 9 2021, 7:44 AM

TODO update tests

ggeorgakoudis removed a parent revision: D97680: [OpenMP] Simplify GPU memory globalization.Nov 9 2021, 7:47 AM

Update tests
Fix for attributes to kmpc_alloc_aggregate_arg
Do not emit allocations if there are no arguments in the aggregate

Herald added subscribers: asavonic, ormris. · View Herald TranscriptNov 11 2021, 10:55 AM

Harbormaster completed remote builds in B133778: Diff 386594.Nov 11 2021, 3:14 PM

JonChesterfield mentioned this in D114865: [AMDGPU][OpenMP] Use -amdgpu-fixed-function-abi.Dec 1 2021, 6:36 AM

Ping!

LG with a nit

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1270	Enclose into braces too

This revision is now accepted and ready to land.Dec 1 2021, 9:26 AM

Rebase, address comment, update few tests

ggeorgakoudis marked 2 inline comments as done.Dec 1 2021, 11:36 AM

This works approximately as well as trunk does for me, provided D114865 is also applied. My baseline is not totally solid but I think there's a credible chance this would pass the buildbot, provided D114865 went in first.

Ron reports two new failures with this applied,
libomptarget :: amdgcn-amd-amdhsa :: offloading/bug51781.c
libomptarget :: amdgcn-amd-amdhsa :: offloading/bug51982.c

My local sm_75 box with this patch applied (and otherwise a clean build) claims failures in

libomptarget :: nvptx64-nvidia-cuda :: offloading/bug49334.cpp
libomptarget :: nvptx64-nvidia-cuda :: offloading/bug51781.c
libomptarget :: nvptx64-nvidia-cuda-newRTL :: offloading/bug49021.cpp
libomptarget :: nvptx64-nvidia-cuda-newRTL :: offloading/bug49334.cpp
libomptarget :: nvptx64-nvidia-cuda-newRTL :: offloading/bug51781.c

Harbormaster completed remote builds in B136979: Diff 391099.Dec 1 2021, 4:40 PM

Can we land this? AMD issues seems resolved.

[AMD Official Use Only]

@Singh, Pushpinder is this resolved?
You were most recently working on it.

Thx

I am seeing a lot of failures on nvptx machine (sm_70, cuda11.4) with this patch,

libomptarget :: nvptx64-nvidia-cuda :: offloading/bug49021.cpp
libomptarget :: nvptx64-nvidia-cuda :: offloading/bug49334.cpp
libomptarget :: nvptx64-nvidia-cuda :: offloading/bug49779.cpp
libomptarget :: nvptx64-nvidia-cuda :: offloading/bug51781.c
libomptarget :: nvptx64-nvidia-cuda :: offloading/bug51982.c
libomptarget :: nvptx64-nvidia-cuda :: unified_shared_memory/close_enter_exit.c
libomptarget :: nvptx64-nvidia-cuda :: unified_shared_memory/close_modifier.c
libomptarget :: nvptx64-nvidia-cuda :: unified_shared_memory/shared_update.c
libomptarget :: nvptx64-nvidia-cuda-newRTL :: offloading/bug49021.cpp
libomptarget :: nvptx64-nvidia-cuda-newRTL :: offloading/bug49334.cpp
libomptarget :: nvptx64-nvidia-cuda-newRTL :: offloading/bug51781.c
libomptarget :: nvptx64-nvidia-cuda-newRTL :: unified_shared_memory/close_enter_exit.c
libomptarget :: nvptx64-nvidia-cuda-newRTL :: unified_shared_memory/close_modifier.c
libomptarget :: nvptx64-nvidia-cuda-newRTL :: unified_shared_memory/shared_update.c

On amdgcn, these are the tests failing,

libomptarget :: amdgcn-amd-amdhsa :: offloading/bug49021.cpp
libomptarget :: amdgcn-amd-amdhsa :: offloading/bug51781.c
libomptarget :: amdgcn-amd-amdhsa :: offloading/bug51982.c
libomptarget :: amdgcn-amd-amdhsa-newRTL :: offloading/bug49021.cpp
libomptarget :: amdgcn-amd-amdhsa-newRTL :: offloading/bug51781.c

jhuber6 mentioned this in rG7cb4c2617391: [OMPIRBuilder] Generate aggregate argument for parallel region outlined….Jan 25 2022, 6:25 PM

I added https://github.com/llvm/llvm-project/issues/54654 documenting what I found when testing this patch on amdgpu.

@ggeorgakoudis Can you please rebase this patch on top of main? Thanks.

Herald added a project: Restricted Project. · View Herald TranscriptMar 30 2022, 12:05 PM

As discussed in https://github.com/llvm/llvm-project/issues/54654, this needs to be added for SPMDization with this patch. Not sure whether further handling is required.

diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index c4736521e475..23cfa6fe5e27 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -4260,6 +4260,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
     case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
     case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
     case OMPRTL___kmpc_nvptx_end_reduce_nowait:
+    case OMPRTL___kmpc_alloc_aggregate_arg:
       break;
     case OMPRTL___kmpc_distribute_static_init_4:
     case OMPRTL___kmpc_distribute_static_init_4u:

In D102107#3417452, @dhruvachak wrote:

I added https://github.com/llvm/llvm-project/issues/54654 documenting what I found when testing this patch on amdgpu.

@ggeorgakoudis Can you please rebase this patch on top of main? Thanks.

Hey @dhruvachak. Unfortunately I can't find time lately to work on this patch. Would you like to take over?

In D102107#3434733, @ggeorgakoudis wrote:

In D102107#3417452, @dhruvachak wrote:

I added https://github.com/llvm/llvm-project/issues/54654 documenting what I found when testing this patch on amdgpu.

@ggeorgakoudis Can you please rebase this patch on top of main? Thanks.

Hey @dhruvachak. Unfortunately I can't find time lately to work on this patch. Would you like to take over?

@ggeorgakoudis I rebased the sources on top of main and resolved conflicts in my local workspace. I haven't updated the clang/llvm tests. There are tests that fail on amdgpu that I am investigating. One example is https://github.com/llvm/llvm-project/issues/54654.

dhruvachak added inline comments.Apr 8 2022, 12:35 PM

llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
931	NoCapture attributes for the parameters need to be removed. See https://github.com/llvm/llvm-project/issues/54654

jdoerfert mentioned this in D129197: [OpenMP] enable kernel launches up to 64 params.Jul 6 2022, 7:04 AM

reverse ping. Are there outstanding issues with this?

Herald added a subscriber: mattd. · View Herald TranscriptJul 6 2022, 8:03 AM

I rebased and resolved conflicts just now and got the compiler built. I did not update the tests, hence not updating this review. I see the following outstanding issues:

(1) make check-libomptarget produces a bunch of failures with the following compile-time assertion. So my rebased patch is not interacting correctly with opaque pointers. It is the same assertion for all the failures.
llvm-project/llvm/include/llvm/IR/Type.h:384: llvm::Type* llvm::Type::getNonOpaquePointerElementType() const: Assertion `NumContainedTys && "Attempting to get element type of opaque pointer"' failed.

(2) From earlier investigation a couple of months back, this patch uses device alloc and will fail if device allocation is not implemented (e.g. in main branch of amdgpu). Most of these failures are seen at -O0, OpenMPOpt is able to optimize them away at higher opt levels. Are we ok with these failures at -O0?

(3) There were a few issues found regarding SPMDization, NoCaptureAttrs, alignment that should be applied to this patch. I have those changes on a local branch.

Also, make sure to remove all deviceRTL files and probably reset the autogenerated tests to upstream (and re-generate) before you merge (or reupload).

In D102107#3633678, @dhruvachak wrote:

I rebased and resolved conflicts just now and got the compiler built. I did not update the tests, hence not updating this review. I see the following outstanding issues:

(1) make check-libomptarget produces a bunch of failures with the following compile-time assertion. So my rebased patch is not interacting correctly with opaque pointers. It is the same assertion for all the failures.
llvm-project/llvm/include/llvm/IR/Type.h:384: llvm::Type* llvm::Type::getNonOpaquePointerElementType() const: Assertion `NumContainedTys && "Attempting to get element type of opaque pointer"' failed.

See my comment below. I think that's the issue.

(2) From earlier investigation a couple of months back, this patch uses device alloc and will fail if device allocation is not implemented (e.g. in main branch of amdgpu). Most of these failures are seen at -O0, OpenMPOpt is able to optimize them away at higher opt levels. Are we ok with these failures at -O0?

It used __kmpc_alloc_shared, which should in theory work with O0 (also for AMDGPU) but in practice might not, especially if it has to fall back to malloc. We are working on malloc support right now. This should not stop us. No reasonable code runs with O0 (on AMDGPU) right now.

(3) There were a few issues found regarding SPMDization, NoCaptureAttrs, alignment that should be applied to this patch. I have those changes on a local branch.

Apply them, I can look over everything again.

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1200	This doesn't work anymore with opaque pointers, IIRC. We should remember the type and pass to this place.

In D102107#3633705, @jdoerfert wrote:

Also, make sure to remove all deviceRTL files and probably reset the autogenerated tests to upstream (and re-generate) before you merge (or reupload).

In D102107#3633678, @dhruvachak wrote:

I rebased and resolved conflicts just now and got the compiler built. I did not update the tests, hence not updating this review. I see the following outstanding issues:

(1) make check-libomptarget produces a bunch of failures with the following compile-time assertion. So my rebased patch is not interacting correctly with opaque pointers. It is the same assertion for all the failures.
llvm-project/llvm/include/llvm/IR/Type.h:384: llvm::Type* llvm::Type::getNonOpaquePointerElementType() const: Assertion `NumContainedTys && "Attempting to get element type of opaque pointer"' failed.

See my comment below. I think that's the issue.

(2) From earlier investigation a couple of months back, this patch uses device alloc and will fail if device allocation is not implemented (e.g. in main branch of amdgpu). Most of these failures are seen at -O0, OpenMPOpt is able to optimize them away at higher opt levels. Are we ok with these failures at -O0?

It used __kmpc_alloc_shared, which should in theory work with O0 (also for AMDGPU) but in practice might not, especially if it has to fall back to malloc. We are working on malloc support right now. This should not stop us. No reasonable code runs with O0 (on AMDGPU) right now.

(3) There were a few issues found regarding SPMDization, NoCaptureAttrs, alignment that should be applied to this patch. I have those changes on a local branch.

Apply them, I can look over everything again.

Yes, I will apply the changes, refresh the tests, and re-upload.

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1200	Thanks. Changing this fixed the assertions.

dhruvachak added inline comments.Jul 8 2022, 11:31 AM

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
3238	This should be VoidTy now that GlobalArgs type has changed.

Is there an llvm/utils script to update clang tests that have RUN lines at the top? An example is clang/test/OpenMP/debug_threadprivate_copyin.c.

In D102107#3639551, @dhruvachak wrote:

Is there an llvm/utils script to update clang tests that have RUN lines at the top? An example is clang/test/OpenMP/debug_threadprivate_copyin.c.

You can create the run lines with the llvm/utils/update_cc_test_checks.py script but those tests have manual lines for now.
I usually run llvm/utils/update_cc_test_checks.py -u clang/test/OpenMP/*.{c,cpp} to update all autogenerated tests.

In D102107#3639556, @jdoerfert wrote:

In D102107#3639551, @dhruvachak wrote:

Is there an llvm/utils script to update clang tests that have RUN lines at the top? An example is clang/test/OpenMP/debug_threadprivate_copyin.c.

You can create the run lines with the llvm/utils/update_cc_test_checks.py script but those tests have manual lines for now.
I usually run llvm/utils/update_cc_test_checks.py -u clang/test/OpenMP/*.{c,cpp} to update all autogenerated tests.

Okay, I will convert those few manual OpenMP tests to autogen format.

How about the AST ones? Do they have to be manually updated? Example: clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c

ast_dump_2_check.py1 KBDownload

In D102107#3639615, @dhruvachak wrote:

In D102107#3639556, @jdoerfert wrote:

In D102107#3639551, @dhruvachak wrote:

Is there an llvm/utils script to update clang tests that have RUN lines at the top? An example is clang/test/OpenMP/debug_threadprivate_copyin.c.

You can create the run lines with the llvm/utils/update_cc_test_checks.py script but those tests have manual lines for now.
I usually run llvm/utils/update_cc_test_checks.py -u clang/test/OpenMP/*.{c,cpp} to update all autogenerated tests.

Okay, I will convert those few manual OpenMP tests to autogen format.

How about the AST ones? Do they have to be manually updated? Example: clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c

For these ones I have a script locally (attached) that needs some manual work, but it helps:

1. run the ast dump and store the result (same as RUN line), e.g.,
  {F23722650} clang -cc1 -internal-isystem /data/build/llvm-project/lib/clang/13.0.0/include -nostdsysteminc -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump /data/src/llvm-project/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_4.cpp &> /tmp/ast
2. python3 ast_dump_2_check.py /tmp/ast CHECK
3. replace the check lines with the content of /tmp/ast.check

In D102107#3639735, @jdoerfert wrote:
ast_dump_2_check.py1 KBDownload
In D102107#3639615, @dhruvachak wrote:

In D102107#3639556, @jdoerfert wrote:

In D102107#3639551, @dhruvachak wrote:

Is there an llvm/utils script to update clang tests that have RUN lines at the top? An example is clang/test/OpenMP/debug_threadprivate_copyin.c.

You can create the run lines with the llvm/utils/update_cc_test_checks.py script but those tests have manual lines for now.
I usually run llvm/utils/update_cc_test_checks.py -u clang/test/OpenMP/*.{c,cpp} to update all autogenerated tests.

Okay, I will convert those few manual OpenMP tests to autogen format.

How about the AST ones? Do they have to be manually updated? Example: clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c

For these ones I have a script locally (attached) that needs some manual work, but it helps:
1. run the ast dump and store the result (same as RUN line), e.g.,
  {F23722650} clang -cc1 -internal-isystem /data/build/llvm-project/lib/clang/13.0.0/include -nostdsysteminc -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump /data/src/llvm-project/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_4.cpp &> /tmp/ast
2. python3 ast_dump_2_check.py /tmp/ast CHECK
3. replace the check lines with the content of /tmp/ast.check

Thanks. I followed the above steps and regenerated a couple of the AST tests but they still fail. Perhaps I am missing some options?

I currently have a handful of clang test failures where regen did not work. I am going to update the patch, post the current test results, and we can figure out how to regen the rest before we land this patch.

In D102107#3640198, @dhruvachak wrote:

Thanks. I followed the above steps and regenerated a couple of the AST tests but they still fail. Perhaps I am missing some options?

I currently have a handful of clang test failures where regen did not work. I am going to update the patch, post the current test results, and we can figure out how to regen the rest before we land this patch.

So, generate check lines for new tests in a separate patch first.
For the AST ones, you need to take the run line of the test, not what I posted there. If it doesn't work, one needs to check why. Hard to diagnose and I don't remember if there is something else. Maybe you need to only include part of it?

In D102107#3640198, @dhruvachak wrote:

Thanks. I followed the above steps and regenerated a couple of the AST tests but they still fail. Perhaps I am missing some options?

I currently have a handful of clang test failures where regen did not work. I am going to update the patch, post the current test results, and we can figure out how to regen the rest before we land this patch.

Sometimes if update_cc_test_checks.py -u ${test} doesn't work you either just need to run it twice so the line numbers get updated on the kernel functions, or you can try taking the command line directly from the top of the file and running it again with that instead of -u. A few options aren't handled properly via the update with -u and need to be run again completely.

Fixed opaque pointer miscompile.
Added alloc_aggregate_arg entry point to OpenMPOpt SPMD list.
Fixed nocapture attribute of kmpc_alloc_aggregate_arg.
Added align attribute for call to kmpc_alloc_shared.
Updated (most) failing clang tests.

Herald added a subscriber: hiraditya. · View Herald TranscriptJul 8 2022, 5:53 PM

In D102107#3640232, @jdoerfert wrote:

In D102107#3640198, @dhruvachak wrote:

Thanks. I followed the above steps and regenerated a couple of the AST tests but they still fail. Perhaps I am missing some options?

I currently have a handful of clang test failures where regen did not work. I am going to update the patch, post the current test results, and we can figure out how to regen the rest before we land this patch.

So, generate check lines for new tests in a separate patch first.

Not sure what you mean by new tests. make check-clang has a few failures on existing tests. I think all of them are regen issues. I will post the results.

For the AST ones, you need to take the run line of the test, not what I posted there. If it doesn't work, one needs to check why. Hard to diagnose and I don't remember if there is something else. Maybe you need to only include part of it?

Yes, I took the run line of the test. The regen worked OK, I removed the old CHECK lines and added the new ones. But make check-clang still flags it as a failure. As you said, we need to understand why.

dhruvachak added inline comments.Jul 8 2022, 6:02 PM

llvm/lib/Transforms/IPO/OpenMPOpt.cpp
4675	@jdoerfert Is this enough to enable SPMDization or is further handling required?

dhruvachak added inline comments.Jul 8 2022, 6:04 PM

llvm/lib/Transforms/IPO/OpenMPOpt.cpp
4675	Just to be clear, this change does allow SPMDization now but want to make sure nothing else is missing.

Harbormaster completed remote builds in B174488: Diff 443399.Jul 8 2022, 6:41 PM

Results from "make check-clang":

Failed Tests (14):

Clang :: AST/ast-dump-openmp-distribute-parallel-for-simd.c
Clang :: AST/ast-dump-openmp-distribute-parallel-for.c
Clang :: AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c
Clang :: AST/ast-dump-openmp-target-teams-distribute-parallel-for.c
Clang :: AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c
Clang :: AST/ast-dump-openmp-teams-distribute-parallel-for.c
Clang :: CodeGenCXX/observe-noexcept.cpp
Clang :: OpenMP/declare_variant_construct_codegen_1.c
Clang :: OpenMP/nvptx_lambda_pointer_capturing.cpp
Clang :: OpenMP/remarks_parallel_in_multiple_target_state_machines.c
Clang :: OpenMP/remarks_parallel_in_target_state_machine.c
Clang :: OpenMP/target_in_reduction_codegen.cpp
Clang :: SemaCXX/static-assert.cpp
Clang :: utils/update_cc_test_checks/generated-funcs.test

Testing Time: 29.33s

Skipped          :     4
Unsupported      :  1478
Passed           : 29406
Expectedly Failed:    27
Failed           :    14

Need to check the following again.

clang/test/AST/ast-dump-openmp-distribute-parallel-for.c was regenerated and part of the patch but the test still fails. The other regenerated AST tests are not part of this patch, they seem to fail even after regen.

Need to regen CodeGenCXX, SemaCXX, and utils tests (3 total).

I tried converting the OpenMP manual CHECK tests to the autogen format. Some of them still fail as above, don't know why.

Need to know how to regen the OpenMP remarks tests.

make check-openmp passes on amdgpu. Need to check on nvptx.

Testing Time: 39.95s

Unsupported      : 143
Passed           : 563
Expectedly Failed:  14

[100%] Built target check-openmp
[100%] Built target check-openmp

It seems the buildbot didn't actually test this patch but an old one, still:

The checks for these tests are not updated:
target_teams_distribute_parallel_for_order_codegen.cpp
target_in_reduction_codegen.cpp
nvptx_lambda_capturing.cpp
nvptx_lambda_pointer_capturing.cpp

Similar to clang/test/OpenMP/declare_variant_construct_codegen_1.c, we should manually update the few fork calls in clang/test/OpenMP/declare_variant_construct_codegen_1.c.

Can you share the output of the AST dump tests and the new check lines, i.e., what the run produces and the file we give to FileCheck to verify it?

clang/test/OpenMP/declare_variant_construct_codegen_1.c
1052	Something went wrong here. Might be easier to manually change the kmpc_fork_call line (should not be much more)

In D102107#3648219, @jdoerfert wrote:

Can you share the output of the AST dump tests and the new check lines, i.e., what the run produces and the file we give to FileCheck to verify it?

I looked at the AST test output and the CHECK lines more carefully. Turns out the full path was embedded in some of the CHECK lines causing the failures. I corrected those manually and those AST tests now pass. I will move on to the other failures.

Regenerated clang tests, make check-clang passes

Rebased on top of a recent commit. Both check-clang and check-openmp (on amdgpu) pass.

Testing Time: 30.73s

Skipped          :     4
Unsupported      :  1480
Passed           : 29554
Expectedly Failed:    27

[100%] Built target check-clang

On amdgpu:

Testing Time: 42.65s

Unsupported      : 145
Passed           : 570
Expectedly Failed:  14

[100%] Built target check-openmp
[100%] Built target check-openmp

@jdoerfert With this patch, additional remarks are being generated. Please check whether the new OMP121 remarks in the following tests are OK.

Clang :: OpenMP/remarks_parallel_in_multiple_target_state_machines.c
Clang :: OpenMP/remarks_parallel_in_target_state_machine.c

All changes from my end are in. Please review.

Harbormaster completed remote builds in B178123: Diff 448404.Jul 28 2022, 1:52 PM

In D102107#3685694, @dhruvachak wrote:

@jdoerfert With this patch, additional remarks are being generated. Please check whether the new OMP121 remarks in the following tests are OK.

Clang :: OpenMP/remarks_parallel_in_multiple_target_state_machines.c
Clang :: OpenMP/remarks_parallel_in_target_state_machine.c

Can you send me the device IR generated for these (-save-temps). I need to check what's happening and building the patch myself will take a while.

@jdoerfert Attached are the device IR files, generated with -save-temps.

remarks_parallel_in_multiple_target_state_machines-openmp-amdgcn-amd-amdhsa.ll318 KBDownload

remarks_parallel_in_target_state_machine-openmp-amdgcn-amd-amdhsa.ll302 KBDownload

Pointing out the recent changes at the corresponding source locations.

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1217	Added align attribute for call to __kmpc_alloc_shared.
clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
253	CapturedVarsElemTypes added to handle opaque pointers.
clang/lib/CodeGen/CodeGenFunction.h
3359	CapturedVarsElemTypes introduced to handle opaque pointers.
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
934	Fixed attribute of __kmpc_alloc_aggregate_arg.
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
4675	Added alloc_aggregate_arg entry point to OpenMPOpt SPMD list.

LG, the new remarks need to be addressed in a follow up. Please test for them and make a TODO that they should be optimized away.

Rebased

This revision was landed with ongoing or failed builds.Sep 14 2022, 5:55 PM

Closed by commit rG7539e9cf811e: [OpenMP] Codegen aggregate for outlined function captures (authored by ggeorgakoudis, committed by dhruvachak). · Explain Why

This revision was automatically updated to reflect the committed changes.

dhruvachak added a commit: rG7539e9cf811e: [OpenMP] Codegen aggregate for outlined function captures.

Harbormaster completed remote builds in B186763: Diff 460274.Sep 14 2022, 6:50 PM

check-llvm fails a bunch of tests for me

Failed Tests (12):

LLVM :: Transforms/OpenMP/custom_state_machines.ll
LLVM :: Transforms/OpenMP/custom_state_machines_remarks.ll
LLVM :: Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
LLVM :: Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
LLVM :: Transforms/OpenMP/is_spmd_exec_mode_fold.ll
LLVM :: Transforms/OpenMP/parallel_level_fold.ll
LLVM :: Transforms/OpenMP/spmdization.ll
LLVM :: Transforms/OpenMP/spmdization_assumes.ll
LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll

In D102107#3791292, @vitalybuka wrote:

check-llvm fails a bunch of tests for me

Failed Tests (12):

LLVM :: Transforms/OpenMP/custom_state_machines.ll
LLVM :: Transforms/OpenMP/custom_state_machines_remarks.ll
LLVM :: Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
LLVM :: Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
LLVM :: Transforms/OpenMP/is_spmd_exec_mode_fold.ll
LLVM :: Transforms/OpenMP/parallel_level_fold.ll
LLVM :: Transforms/OpenMP/spmdization.ll
LLVM :: Transforms/OpenMP/spmdization_assumes.ll
LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll

Thanks for reporting them. I need to update them.

dhruvachak added a reverting change: rG839ac62c5085: Revert "[OpenMP] Codegen aggregate for outlined function captures".Sep 14 2022, 8:09 PM

I reverted this commit while I fix the failing tests.

This patch was reverted.

This revision is now accepted and ready to land.Sep 23 2022, 1:41 PM

Updated llvm tests. The following 3 tests still fail:

LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll

dhruvachak edited the summary of this revision. (Show Details)Sep 23 2022, 1:44 PM

In D102107#3812554, @dhruvachak wrote:
Updated llvm tests. The following 3 tests still fail:
LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll

@jdoerfert @jhuber6
I updated the LLVM tests except one, Transforms/OpenMP/spmdization_constant_prop.ll. There is no C source snippet in there. Can you help as to how to update it? Please review the diffs for all the updated LLVM tests as well.

In addition, the other 2 tests above fail even after updating. Looks like something is wrong. Can you help as to how to fix them?

Similar to clang tests, we are seeing remarks differences. We already decided to file an issue (after this patch lands) and look at them after-the-fact.

Harbormaster completed remote builds in B188469: Diff 462582.Sep 23 2022, 3:49 PM

In D102107#3812582, @dhruvachak wrote:
In D102107#3812554, @dhruvachak wrote:
Updated llvm tests. The following 3 tests still fail:
LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll
@jdoerfert @jhuber6
I updated the LLVM tests except one, Transforms/OpenMP/spmdization_constant_prop.ll. There is no C source snippet in there. Can you help as to how to update it? Please review the diffs for all the updated LLVM tests as well.

In addition, the other 2 tests above fail even after updating. Looks like something is wrong. Can you help as to how to fix them?

Similar to clang tests, we are seeing remarks differences. We already decided to file an issue (after this patch lands) and look at them after-the-fact.

Did you recreate the tests from the C snippet? That is probably not a good idea. We should modify the IR. If we start with C code we can't do it like this anyway. I mean:

the IR is totally different,
the debug info is missing,
lots of unrelated metadata,
part of the device runtime was merged in,
...

In D102107#3812946, @jdoerfert wrote:
In D102107#3812582, @dhruvachak wrote:
In D102107#3812554, @dhruvachak wrote:
Updated llvm tests. The following 3 tests still fail:
LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll
@jdoerfert @jhuber6
I updated the LLVM tests except one, Transforms/OpenMP/spmdization_constant_prop.ll. There is no C source snippet in there. Can you help as to how to update it? Please review the diffs for all the updated LLVM tests as well.

In addition, the other 2 tests above fail even after updating. Looks like something is wrong. Can you help as to how to fix them?

Similar to clang tests, we are seeing remarks differences. We already decided to file an issue (after this patch lands) and look at them after-the-fact.
Did you recreate the tests from the C snippet? That is probably not a good idea. We should modify the IR. If we start with C code we can't do it like this anyway. I mean:

the IR is totally different,

the debug info is missing,

lots of unrelated metadata,

part of the device runtime was merged in,

...

Yes, for the ones that have the C snippet, I re-created from that. Since the IR is quite different now, I thought this was the best way and less error-prone while generating the new IR.

Can you help update these tests by getting the patch locally?

These have the C snippet.

LLVM :: Transforms/OpenMP/custom_state_machines.ll
LLVM :: Transforms/OpenMP/custom_state_machines_remarks.ll
LLVM :: Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
LLVM :: Transforms/OpenMP/spmdization.ll
LLVM :: Transforms/OpenMP/spmdization_assumes.ll
LLVM :: Transforms/OpenMP/spmdization_guarding.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll

I think the following are updated correctly and pass.

LLVM :: Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
LLVM :: Transforms/OpenMP/is_spmd_exec_mode_fold.ll
LLVM :: Transforms/OpenMP/parallel_level_fold.ll

I was not able to update the following, so it fails.

LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll

I'm unlikely to get to it in the next 2 weeks (IWOMP and OpenMP F2F). What I would do is to take the new IR, the old IR, run instnamer on the new one. Then splice in the new parts into the old IR removing what was there wrt. parallel_51.

Hi, any chance this will be completed any time soon? We are very keen to resurrect our clang-based OpenMP offloading pipeline at https://github.com/devitocodes/devito :-)

@dhruvachak Do you still need help updating the LLVM tests?

In D102107#3921948, @jhuber6 wrote:

@dhruvachak Do you still need help updating the LLVM tests?

If you go a few messages back, there are some llvm tests that @jdoerfert said were not updated properly. Can someone help update those tests properly?

In D102107#3922842, @dhruvachak wrote:

In D102107#3921948, @jhuber6 wrote:

@dhruvachak Do you still need help updating the LLVM tests?

If you go a few messages back, there are some llvm tests that @jdoerfert said were not updated properly. Can someone help update those tests properly?

The patch does not apply cleanly currently, please rebase it and I'll try to get it working locally.

Rebased.

Herald added subscribers: kosarev, jvesely. · View Herald TranscriptNov 14 2022, 10:30 PM

@jhuber6

Turns out a rebase on top of trunk had ~200 test conflicts. During my last update in Sep, I had resolved all of the clang test conflicts and failures; there were only llvm test failures.

At this point, I checked out commit 92bc3fb5 for all the failed tests (both clang and llvm tests) and then ran update_cc_test_checks.py on all of the auto-generated clang tests with the updated compiler. After this update, the test results look like the following:

make check-clang: Some of these may be new since the last iteration. But I believe most of them need some manual updates. I did not check all of the 7 tests below but I believe most of them are not autogenerated. I suggest looking at them after the llvm tests are regenerated properly.

Failed Tests (7):

Clang :: CodeGen/PowerPC/ppc64le-varargs-f128.c
Clang :: OpenMP/nvptx_target_printf_codegen.c
Clang :: OpenMP/parallel_copyin_combined_codegen.c
Clang :: OpenMP/target_globals_codegen.cpp
Clang :: OpenMP/target_map_codegen_hold.cpp
Clang :: OpenMP/task_target_device_codegen.c
Clang :: OpenMP/unroll_codegen_parallel_for_factor.cpp

Testing Time: 47.70s

Skipped          :     4
Unsupported      :  1490
Passed           : 30113
Expectedly Failed:    28
Failed           :     7

make check-llvm: Other than spmdization_constant_prop, I think the rest of them have a C code snippet. These are the ones to look at first. These tests are not updated in the current rebased version. You may see the updates I made to them from the previous commit.

Failed Tests (9):

LLVM :: Transforms/OpenMP/custom_state_machines.ll
LLVM :: Transforms/OpenMP/custom_state_machines_remarks.ll
LLVM :: Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
LLVM :: Transforms/OpenMP/spmdization.ll
LLVM :: Transforms/OpenMP/spmdization_assumes.ll
LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll

Testing Time: 58.80s

Skipped          :    59
Unsupported      : 19062
Passed           : 31988
Expectedly Failed:    69
Failed           :     9

make check-openmp: On amdgpu, this looks good.

Expectedly Failed Tests (12):

libomptarget :: amdgcn-amd-amdhsa :: mapping/data_member_ref.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_nested_default_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_nested_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/lambda_by_value.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/ompx_hold/struct.c
libomptarget :: amdgcn-amd-amdhsa :: offloading/host_as_target.c
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/data_member_ref.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/declare_mapper_nested_default_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/declare_mapper_nested_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/lambda_by_value.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/ompx_hold/struct.c
libomptarget :: amdgcn-amd-amdhsa-LTO :: offloading/host_as_target.c

Testing Time: 145.83s

Unsupported      : 139
Passed           : 613
Expectedly Failed:  12

[100%] Built target check-openmp
[100%] Built target check-openmp

Harbormaster completed remote builds in B197671: Diff 475341.Nov 14 2022, 11:38 PM

Rebased.

Herald added a subscriber: • pcwang-thead. · View Herald TranscriptJan 17 2023, 9:43 PM

After rebasing on top of main today and regenerating all the auto-update clang tests, here are the test results. The AST tests have to be updated manually as Johannes mentioned earlier. I haven't looked at the other clang test failures.

The llvm tests need to be fixed, they have not been regenerated at this point. @jhuber6

make check-clang:

Failed Tests (16):

Clang :: AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c
Clang :: AST/ast-dump-openmp-target-teams-distribute-parallel-for.c
Clang :: AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c
Clang :: AST/ast-dump-openmp-teams-distribute-parallel-for.c
Clang :: CodeGen/PowerPC/ppc64le-varargs-f128.c
Clang :: OpenMP/irbuilder_safelen.cpp
Clang :: OpenMP/irbuilder_safelen_order_concurrent.cpp
Clang :: OpenMP/irbuilder_simd_aligned.cpp
Clang :: OpenMP/irbuilder_simdlen.cpp
Clang :: OpenMP/irbuilder_simdlen_safelen.cpp
Clang :: OpenMP/parallel_copyin_combined_codegen.c
Clang :: OpenMP/target_data_map_codegen_hold.cpp
Clang :: OpenMP/target_globals_codegen.cpp
Clang :: OpenMP/target_map_codegen_hold.cpp
Clang :: OpenMP/target_map_member_expr_codegen.cpp
Clang :: OpenMP/unroll_codegen_parallel_for_factor.cpp

Testing Time: 51.15s

Skipped          :     4
Unsupported      :  2776
Passed           : 30423
Expectedly Failed:    26
Failed           :    16

make check-llvm:

Failed Tests (13):

LLVM :: Transforms/OpenMP/custom_state_machines.ll
LLVM :: Transforms/OpenMP/custom_state_machines_remarks.ll
LLVM :: Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
LLVM :: Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
LLVM :: Transforms/OpenMP/is_spmd_exec_mode_fold.ll
LLVM :: Transforms/OpenMP/nested_parallelism.ll
LLVM :: Transforms/OpenMP/parallel_level_fold.ll
LLVM :: Transforms/OpenMP/spmdization.ll
LLVM :: Transforms/OpenMP/spmdization_assumes.ll
LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
LLVM :: Transforms/OpenMP/spmdization_guarding.ll
LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
LLVM :: Transforms/OpenMP/spmdization_remarks.ll

Testing Time: 83.05s

Skipped          :    59
Unsupported      : 19442
Passed           : 32601
Expectedly Failed:    68
Failed           :    13

make check-openmp on amdgpu:

Expectedly Failed Tests (12):

libomptarget :: amdgcn-amd-amdhsa :: mapping/data_member_ref.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_nested_default_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/declare_mapper_nested_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/lambda_by_value.cpp
libomptarget :: amdgcn-amd-amdhsa :: mapping/ompx_hold/struct.c
libomptarget :: amdgcn-amd-amdhsa :: offloading/host_as_target.c
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/data_member_ref.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/declare_mapper_nested_default_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/declare_mapper_nested_mappers.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/lambda_by_value.cpp
libomptarget :: amdgcn-amd-amdhsa-LTO :: mapping/ompx_hold/struct.c
libomptarget :: amdgcn-amd-amdhsa-LTO :: offloading/host_as_target.c

Testing Time: 148.67s

Unsupported      : 141
Passed           : 676
Expectedly Failed:  12

Harbormaster completed remote builds in B208402: Diff 490030.Jan 17 2023, 10:33 PM

Rebased and updated tests.

Herald added subscribers: jplehr, sunshaoce, kerbowa. · View Herald TranscriptApr 3 2023, 11:35 PM

I rebased the patch and regenerated the clang tests. I haven't regenerated the llvm tests. @jhuber6 @jdoerfert Please help regenerate the llvm tests. Several of the failing clang tests were regenerated earlier; they can perhaps be regenerated after the llvm tests are regenerated.

Here are the test results. check-openmp on amdgpu passes.

Failed Tests (13):
  Clang :: AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c
  Clang :: AST/ast-dump-openmp-target-teams-distribute-parallel-for.c
  Clang :: AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c
  Clang :: AST/ast-dump-openmp-teams-distribute-parallel-for.c
  Clang :: CodeGen/PowerPC/ppc64le-varargs-f128.c
  Clang :: Headers/amdgcn-openmp-device-math-complex.c
  Clang :: Headers/amdgcn-openmp-device-math-complex.cpp
  Clang :: Headers/amdgcn_openmp_device_math.c
  Clang :: Headers/openmp_device_math_isnan.cpp
  Clang :: OpenMP/nvptx_lambda_pointer_capturing.cpp
  Clang :: OpenMP/parallel_copyin_combined_codegen.c
  Clang :: OpenMP/target_globals_codegen.cpp
  Clang :: OpenMP/unroll_codegen_parallel_for_factor.cpp

Failed Tests (14):
  LLVM :: Transforms/OpenMP/custom_state_machines.ll
  LLVM :: Transforms/OpenMP/custom_state_machines_remarks.ll
  LLVM :: Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
  LLVM :: Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
  LLVM :: Transforms/OpenMP/is_spmd_exec_mode_fold.ll
  LLVM :: Transforms/OpenMP/nested_parallelism.ll
  LLVM :: Transforms/OpenMP/parallel_level_fold.ll
  LLVM :: Transforms/OpenMP/spmdization.ll
  LLVM :: Transforms/OpenMP/spmdization_assumes.ll
  LLVM :: Transforms/OpenMP/spmdization_constant_prop.ll
  LLVM :: Transforms/OpenMP/spmdization_guarding.ll
  LLVM :: Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
  LLVM :: Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
  LLVM :: Transforms/OpenMP/spmdization_remarks.ll

Harbormaster completed remote builds in B223512: Diff 510706.Apr 4 2023, 3:40 AM

Fixed the Clang tests. Haven't touched the LLVM ones because this breaks SPMDization and state machine rewrites completely in those tests. Someone who knows what this patch changes should look into what needs to be updated to make those tests match whatever form SPMDization expects now. Also for some bizarre reason this patch breaks adding alwaysinline on kmpc_parallel_51.

Herald added subscribers: kbarton, nemanjai. · View Herald TranscriptApr 4 2023, 7:13 AM

Harbormaster completed remote builds in B223575: Diff 510797.Apr 4 2023, 8:29 AM

In D102107#4243260, @jhuber6 wrote:

Fixed the Clang tests. Haven't touched the LLVM ones because this breaks SPMDization and state machine rewrites completely in those tests. Someone who knows what this patch changes should look into what needs to be updated to make those tests match whatever form SPMDization expects now. Also for some bizarre reason this patch breaks adding alwaysinline on kmpc_parallel_51.

I hadn't looked into whether OpenMPOpt was working ok, so did not realize this breakage. @jdoerfert If you get a chance, please look into this problem ^^.

doru1004 mentioned this in D150134: [OpenMP][Libomptarget] Enable up to 64 arguments for outlined regions in OpenMP device code.May 8 2023, 12:44 PM

I'm trying to pick up the context for this and D95976. Superficially it looks like lowering variadic functions in the compiler could be used to simplify quite a lot of this, @jdoerfert there's a comment from some time ago which suggests that this code path was originally a workaround for lack of variadics.

I'm currently debugging an IR pass that eliminates variadic calls, in the hope of using it all the time on amdgpu. I think it could be adapted to patch these calls on the fly for nvptx as well if we added it to the openmp codegen pipeline; I need to see whether the function pointer interacts well with the recent specialisation pass.

Large Diff

This large diff affects 246 files. Files without inline comments have been collapsed. Expand All Files

Revision Contents

Path

Size

clang/

lib/

CodeGen/

CGOpenMPRuntime.h

2 lines

CGOpenMPRuntime.cpp

23 lines

CGOpenMPRuntimeGPU.h

1 line

CGOpenMPRuntimeGPU.cpp

186 lines

CGStmtOpenMP.cpp

232 lines

CodeGenFunction.h

6 lines

Sema/

SemaOpenMP.cpp

76 lines

test/

AST/

ast-dump-openmp-distribute-parallel-for-simd.c

435 lines

ast-dump-openmp-distribute-parallel-for.c

435 lines

ast-dump-openmp-target-teams-distribute-parallel-for-simd.c

3725 lines

ast-dump-openmp-target-teams-distribute-parallel-for.c

3725 lines

ast-dump-openmp-teams-distribute-parallel-for-simd.c

4121 lines

ast-dump-openmp-teams-distribute-parallel-for.c

4121 lines

CodeGen/

PowerPC/

ppc64le-varargs-f128.c

3 lines

OpenMP/

amdgpu_target_with_aligned_attribute.c

284 lines

bug54082.c

39 lines

bug60602.cpp

359 lines

cancel_codegen.cpp

684 lines

cancellation_point_codegen.cpp

431 lines

debug-info-complex-byval.cpp

56 lines

debug-info-openmp-array.cpp

184 lines

debug_threadprivate_copyin.c

35 lines

declare_target_codegen_globalization.cpp

24 lines

declare_target_constexpr_codegen.cpp

15 lines

declare_variant_construct_codegen_1.c

8 lines

distribute_codegen.cpp

1976 lines

distribute_firstprivate_codegen.cpp

970 lines

distribute_lastprivate_codegen.cpp

1002 lines

distribute_parallel_for_codegen.cpp

11090 lines

distribute_parallel_for_firstprivate_codegen.cpp

1912 lines

distribute_parallel_for_if_codegen.cpp

964 lines

distribute_parallel_for_lastprivate_codegen.cpp

2088 lines

distribute_parallel_for_num_threads_codegen.cpp

2996 lines

distribute_parallel_for_private_codegen.cpp

942 lines

distribute_parallel_for_proc_bind_codegen.cpp

348 lines

distribute_parallel_for_reduction_task_codegen.cpp

512 lines

distribute_parallel_for_simd_codegen.cpp

11892 lines

distribute_parallel_for_simd_firstprivate_codegen.cpp

1984 lines

distribute_parallel_for_simd_if_codegen.cpp

4876 lines

distribute_parallel_for_simd_lastprivate_codegen.cpp

2160 lines

distribute_parallel_for_simd_num_threads_codegen.cpp

3284 lines

distribute_parallel_for_simd_private_codegen.cpp

1014 lines

distribute_parallel_for_simd_proc_bind_codegen.cpp

384 lines

distribute_private_codegen.cpp

542 lines

distribute_simd_codegen.cpp

4808 lines

distribute_simd_firstprivate_codegen.cpp

1006 lines

distribute_simd_lastprivate_codegen.cpp

1038 lines

distribute_simd_private_codegen.cpp

624 lines

distribute_simd_reduction_codegen.cpp

399 lines

for_firstprivate_codegen.cpp

367 lines

for_lastprivate_codegen.cpp

2949 lines

for_linear_codegen.cpp

1545 lines

for_private_codegen.cpp

359 lines

for_reduction_codegen.cpp

3799 lines

for_reduction_codegen_UDR.cpp

2315 lines

for_reduction_task_codegen.cpp

428 lines

irbuilder_safelen.cpp

15 lines

irbuilder_safelen_order_concurrent.cpp

17 lines

irbuilder_simd_aligned.cpp

19 lines

irbuilder_simdlen.cpp

17 lines

irbuilder_simdlen_safelen.cpp

15 lines

master_taskloop_in_reduction_codegen.cpp

141 lines

master_taskloop_simd_in_reduction_codegen.cpp

141 lines

metadirective_device_kind_codegen.c

55 lines

metadirective_device_kind_codegen.cpp

55 lines

metadirective_implementation_codegen.cpp

42 lines

nested_loop_codegen.cpp

308 lines

nvptx_SPMD_codegen.cpp

22246 lines

nvptx_allocate_codegen.cpp

27 lines

nvptx_data_sharing.cpp

77 lines

nvptx_distribute_parallel_generic_mode_codegen.cpp

786 lines

nvptx_lambda_capturing.cpp

703 lines

nvptx_lambda_pointer_capturing.cpp

25 lines

nvptx_multi_target_parallel_codegen.cpp

88 lines

nvptx_nested_parallel_codegen.cpp

128 lines

nvptx_parallel_codegen.cpp

344 lines

nvptx_parallel_for_codegen.cpp

105 lines

nvptx_target_codegen.cpp

224 lines

nvptx_target_parallel_codegen.cpp

164 lines

nvptx_target_parallel_num_threads_codegen.cpp

164 lines

nvptx_target_parallel_proc_bind_codegen.cpp

756 lines

nvptx_target_parallel_reduction_codegen.cpp

897 lines

nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp

604 lines

nvptx_target_teams_codegen.cpp

250 lines

nvptx_target_teams_distribute_codegen.cpp

156 lines

nvptx_target_teams_distribute_parallel_for_codegen.cpp

4762 lines

nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp

634 lines

nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp

2108 lines

nvptx_target_teams_distribute_simd_codegen.cpp

2776 lines

nvptx_teams_codegen.cpp

136 lines

nvptx_teams_reduction_codegen.cpp

1122 lines

openmp_win_codegen.cpp

66 lines

outlined_artificial.c

5 lines

parallel_codegen.cpp

724 lines

parallel_copyin_codegen.cpp

805 lines

parallel_copyin_combined_codegen.c

984 lines

parallel_firstprivate_codegen.cpp

2078 lines

parallel_for_codegen.cpp

4658 lines

parallel_for_lastprivate_conditional.cpp

273 lines

parallel_for_linear_codegen.cpp

435 lines

parallel_for_reduction_task_codegen.cpp

420 lines

parallel_for_simd_aligned_codegen.cpp

59 lines

parallel_if_codegen.cpp

94 lines

parallel_if_codegen_PR51349.cpp

36 lines

parallel_masked.cpp

60 lines

parallel_masked_target.cpp

60 lines

parallel_master_codegen.cpp

343 lines

parallel_master_reduction_task_codegen.cpp

390 lines

parallel_master_taskloop_codegen.cpp

519 lines

parallel_master_taskloop_firstprivate_codegen.cpp

890 lines

parallel_master_taskloop_lastprivate_codegen.cpp

639 lines

parallel_master_taskloop_simd_codegen.cpp

1494 lines

parallel_master_taskloop_simd_firstprivate_codegen.cpp

890 lines

parallel_master_taskloop_simd_lastprivate_codegen.cpp

652 lines

parallel_private_codegen.cpp

311 lines

parallel_reduction_codegen.cpp

2015 lines

parallel_reduction_task_codegen.cpp

378 lines

parallel_sections_codegen.cpp

96 lines

parallel_sections_reduction_task_codegen.cpp

416 lines

reduction_compound_op.cpp

2272 lines

reduction_implicit_map.cpp

1358 lines

remarks_parallel_in_multiple_target_state_machines.c

19 lines

remarks_parallel_in_target_state_machine.c

15 lines

sections_firstprivate_codegen.cpp

307 lines

sections_lastprivate_codegen.cpp

1047 lines

sections_private_codegen.cpp

227 lines

sections_reduction_codegen.cpp

675 lines

sections_reduction_task_codegen.cpp

424 lines

single_codegen.cpp

1936 lines

single_firstprivate_codegen.cpp

221 lines

single_private_codegen.cpp

123 lines

target_codegen_global_capture.cpp

1562 lines

target_data_map_codegen_hold.cpp

10 lines

target_in_reduction_codegen.cpp

106 lines

target_map_codegen_03.cpp

236 lines

target_map_codegen_hold.cpp

104 lines

target_map_member_expr_codegen.cpp

54 lines

target_ompx_dyn_cgroup_mem_codegen.cpp

1232 lines

target_parallel_codegen.cpp

1858 lines

target_parallel_debug_codegen.cpp

833 lines

target_parallel_for_codegen.cpp

4936 lines

target_parallel_for_debug_codegen.cpp

1177 lines

target_parallel_for_reduction_task_codegen.cpp

420 lines

target_parallel_for_simd_codegen.cpp

7090 lines

target_parallel_if_codegen.cpp

608 lines

target_parallel_num_threads_codegen.cpp

476 lines

target_parallel_reduction_task_codegen.cpp

378 lines

target_teams_codegen.cpp

2242 lines

target_teams_distribute_codegen.cpp

3326 lines

target_teams_distribute_collapse_codegen.cpp

712 lines

target_teams_distribute_dist_schedule_codegen.cpp

1380 lines

target_teams_distribute_firstprivate_codegen.cpp

754 lines

target_teams_distribute_lastprivate_codegen.cpp

1020 lines

target_teams_distribute_parallel_for_codegen.cpp

2306 lines

target_teams_distribute_parallel_for_collapse_codegen.cpp

1414 lines

target_teams_distribute_parallel_for_dist_schedule_codegen.cpp

3250 lines

target_teams_distribute_parallel_for_firstprivate_codegen.cpp

3238 lines

target_teams_distribute_parallel_for_if_codegen.cpp

1040 lines

target_teams_distribute_parallel_for_lastprivate_codegen.cpp

2190 lines

target_teams_distribute_parallel_for_order_codegen.cpp

120 lines

target_teams_distribute_parallel_for_private_codegen.cpp

1642 lines

target_teams_distribute_parallel_for_proc_bind_codegen.cpp

366 lines

target_teams_distribute_parallel_for_reduction_codegen.cpp

992 lines

target_teams_distribute_parallel_for_reduction_task_codegen.cpp

830 lines

target_teams_distribute_parallel_for_schedule_codegen.cpp

10648 lines

target_teams_distribute_parallel_for_simd_codegen.cpp

2732 lines

target_teams_distribute_parallel_for_simd_collapse_codegen.cpp

1582 lines

target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp

3514 lines

target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp

3358 lines

target_teams_distribute_parallel_for_simd_if_codegen.cpp

5396 lines

target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp

2262 lines

target_teams_distribute_parallel_for_simd_private_codegen.cpp

1762 lines

target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp

402 lines

target_teams_distribute_parallel_for_simd_reduction_codegen.cpp

1052 lines

target_teams_distribute_parallel_for_simd_schedule_codegen.cpp

11528 lines

target_teams_distribute_private_codegen.cpp

326 lines

target_teams_distribute_reduction_codegen.cpp

5917 lines

target_teams_distribute_simd_codegen.cpp

7160 lines

target_teams_distribute_simd_collapse_codegen.cpp

796 lines

target_teams_distribute_simd_dist_schedule_codegen.cpp

1512 lines

target_teams_distribute_simd_firstprivate_codegen.cpp

784 lines

target_teams_distribute_simd_lastprivate_codegen.cpp

1056 lines

target_teams_distribute_simd_private_codegen.cpp

356 lines

target_teams_distribute_simd_reduction_codegen.cpp

479 lines

target_teams_map_codegen.cpp

1852 lines

target_teams_num_teams_codegen.cpp

476 lines

target_teams_thread_limit_codegen.cpp

476 lines

task_codegen.cpp

253 lines

task_if_codegen.cpp

84 lines

task_in_reduction_codegen.cpp

131 lines

taskgroup_codegen.cpp

52 lines

taskloop_in_reduction_codegen.cpp

133 lines

taskloop_simd_in_reduction_codegen.cpp

133 lines

teams_codegen.cpp

598 lines

teams_distribute_codegen.cpp

926 lines

teams_distribute_collapse_codegen.cpp

688 lines

teams_distribute_dist_schedule_codegen.cpp

1344 lines

teams_distribute_firstprivate_codegen.cpp

754 lines

teams_distribute_lastprivate_codegen.cpp

978 lines

teams_distribute_parallel_for_codegen.cpp

2154 lines

teams_distribute_parallel_for_collapse_codegen.cpp

1330 lines

teams_distribute_parallel_for_copyin_codegen.cpp

762 lines

teams_distribute_parallel_for_dist_schedule_codegen.cpp

3070 lines

teams_distribute_parallel_for_firstprivate_codegen.cpp

1608 lines

teams_distribute_parallel_for_if_codegen.cpp

992 lines

teams_distribute_parallel_for_lastprivate_codegen.cpp

2064 lines

teams_distribute_parallel_for_num_threads_codegen.cpp

1532 lines

teams_distribute_parallel_for_private_codegen.cpp

808 lines

teams_distribute_parallel_for_proc_bind_codegen.cpp

348 lines

teams_distribute_parallel_for_reduction_codegen.cpp

962 lines

teams_distribute_parallel_for_reduction_task_codegen.cpp

824 lines

teams_distribute_parallel_for_schedule_codegen.cpp

10048 lines

teams_distribute_parallel_for_simd_codegen.cpp

2876 lines

teams_distribute_parallel_for_simd_collapse_codegen.cpp

1498 lines

teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp

3334 lines

teams_distribute_parallel_for_simd_firstprivate_codegen.cpp

1668 lines

teams_distribute_parallel_for_simd_if_codegen.cpp

5080 lines

teams_distribute_parallel_for_simd_lastprivate_codegen.cpp

2136 lines

teams_distribute_parallel_for_simd_num_threads_codegen.cpp

1676 lines

teams_distribute_parallel_for_simd_private_codegen.cpp

868 lines

teams_distribute_parallel_for_simd_proc_bind_codegen.cpp

384 lines

teams_distribute_parallel_for_simd_reduction_codegen.cpp

1022 lines

teams_distribute_parallel_for_simd_schedule_codegen.cpp

10932 lines

teams_distribute_private_codegen.cpp

326 lines

teams_distribute_reduction_codegen.cpp

449 lines

teams_distribute_simd_codegen.cpp

1890 lines

teams_distribute_simd_collapse_codegen.cpp

772 lines

teams_distribute_simd_dist_schedule_codegen.cpp

1476 lines

teams_distribute_simd_firstprivate_codegen.cpp

784 lines

teams_distribute_simd_lastprivate_codegen.cpp

1014 lines

teams_distribute_simd_private_codegen.cpp

356 lines

teams_distribute_simd_reduction_codegen.cpp

479 lines

teams_firstprivate_codegen.cpp

1096 lines

teams_private_codegen.cpp

358 lines

tile_codegen.cpp

140 lines

unroll_codegen_parallel_for_factor.cpp

363 lines

vla_crash.c

103 lines

utils/

update_cc_test_checks/

Inputs/

generated-funcs.c.generated.expected

88 lines

generated-funcs.c.no-generated.expected

6 lines

llvm/

include/

llvm/

Frontend/

OpenMP/

OMPKinds.def

14 lines

lib/

Transforms/

IPO/

OpenMPOpt.cpp

1 line

openmp/

libomptarget/

DeviceRTL/

include/

Interface.h

27 lines

generated_microtask_cases.gen

src/

Parallelism.cpp

84 lines

State.cpp

42 lines

utils/

generate_microtask_cases.py

Diff 510797

clang/lib/CodeGen/CGOpenMPRuntime.h

Load File

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Load File

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

Show First 20 Lines • Show All 244 Lines • ▼ Show 20 Lines	public:
/// \param IfCond Condition in the associated 'if' clause, if it was		/// \param IfCond Condition in the associated 'if' clause, if it was
/// specified, nullptr otherwise.		/// specified, nullptr otherwise.
/// \param NumThreads The value corresponding to the num_threads clause, if		/// \param NumThreads The value corresponding to the num_threads clause, if
/// any,		/// any,
/// or nullptr.		/// or nullptr.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,		void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Function *OutlinedFn,		llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,		ArrayRef<llvm::Value *> CapturedVars,
		ArrayRef<llvm::Type *> CapturedVarsElemTypes,
		dhruvachakUnsubmitted Not Done Reply Inline Actions CapturedVarsElemTypes added to handle opaque pointers. dhruvachak: CapturedVarsElemTypes added to handle opaque pointers.
const Expr *IfCond, llvm::Value *NumThreads) override;		const Expr *IfCond, llvm::Value *NumThreads) override;

/// Emit an implicit/explicit barrier for OpenMP threads.		/// Emit an implicit/explicit barrier for OpenMP threads.
/// \param Kind Directive for which this implicit barrier call must be		/// \param Kind Directive for which this implicit barrier call must be
/// generated. Must be OMPD_barrier for explicit barrier generation.		/// generated. Must be OMPD_barrier for explicit barrier generation.
/// \param EmitChecks true if need to emit checks for cancellation barriers.		/// \param EmitChecks true if need to emit checks for cancellation barriers.
/// \param ForceSimpleCall true simple barrier call must be emitted, false if		/// \param ForceSimpleCall true simple barrier call must be emitted, false if
/// runtime class decides which one to emit (simple or with cancellation		/// runtime class decides which one to emit (simple or with cancellation
▲ Show 20 Lines • Show All 168 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Show First 20 Lines • Show All 1,158 Lines • ▼ Show 20 Lines	void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);		CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;		llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());		OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());		OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());		OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);		emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}		}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,		void CGOpenMPRuntimeGPU::emitParallelCall(
SourceLocation Loc,		CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,		ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond,		ArrayRef<llvm::Type *> CapturedVarsElemTypes, const Expr *IfCond,
llvm::Value *NumThreads) {		llvm::Value *NumThreads) {
if (!CGF.HaveInsertPoint())		if (!CGF.HaveInsertPoint())
return;		return;

auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,		auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
NumThreads](CodeGenFunction &CGF,		CapturedVarsElemTypes, IfCond, NumThreads](
PrePostActionTy &Action) {		CodeGenFunction &CGF, PrePostActionTy &Action) {
CGBuilderTy &Bld = CGF.Builder;		CGBuilderTy &Bld = CGF.Builder;
llvm::Value *NumThreadsVal = NumThreads;		llvm::Value *NumThreadsVal = NumThreads;
llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];		llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);		llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
if (WFn)		if (WFn)
ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);		ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);		llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

// Create a private scope that will globalize the arguments		// Create a private scope that will globalize the arguments
// passed from the outside of the target region.		// passed from the outside of the target region.
// TODO: Is that needed?		// TODO: Is that needed?
CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);		CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(		assert(CapturedVars.size() == 1 &&
llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),		"Expected single aggregate argument to outlined function");
"captured_vars_addrs");
// There's something to share.		// Globalize the single aggregate argument, if needed, or use a local
if (!CapturedVars.empty()) {		// alloca, or emit null when there are no arguments.
// Prepare for parallel region. Indicate the outlined function.		llvm::Value *AggregateV = CapturedVars[0];
ASTContext &Ctx = CGF.getContext();		assert(AggregateV->getType()->isPointerTy() &&
unsigned Idx = 0;		"Expected pointer type for aggregate argument.");
for (llvm::Value *V : CapturedVars) {
Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);		assert(CapturedVarsElemTypes.size() == 1 &&
		jdoerfertUnsubmitted Not Done Reply Inline Actions This doesn't work anymore with opaque pointers, IIRC. We should remember the type and pass it to this place. jdoerfert: This doesn't work anymore with opaque pointers, IIRC. We should remember the type and pass it to…
		dhruvachakUnsubmitted Not Done Reply Inline Actions Thanks. Changing this fixed the assertions. dhruvachak: Thanks. Changing this fixed the assertions.
llvm::Value *PtrV;		"Expected single element in array of types.");
if (V->getType()->isIntegerTy())		llvm::Type *PtrElemTy = CapturedVarsElemTypes[0];
PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);		auto &DL = CGM.getDataLayout();
else		unsigned AllocSize = DL.getTypeAllocSize(PtrElemTy);
PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
CGF.EmitStoreOfScalar(PtrV, Dst, /Volatile=/false,		llvm::CallBase *GlobalPtr = nullptr;
Ctx.getPointerType(Ctx.VoidPtrTy));		llvm::Value *AggregatePtr = nullptr;
++Idx;
}		if (AllocSize) {
		llvm::AllocaInst *LocalAlloc =
		CGF.CreateTempAlloca(PtrElemTy, ".tmp.outlined.agg.arg");
		llvm::Value *LocalPtr = Bld.CreatePointerCast(LocalAlloc, CGF.VoidPtrTy);
		GlobalPtr =
		CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
		CGM.getModule(), OMPRTL___kmpc_alloc_shared),
		{llvm::ConstantInt::get(CGM.SizeTy, AllocSize)});
		GlobalPtr->addRetAttr(llvm::Attribute::get(
		dhruvachakUnsubmitted Not Done Reply Inline Actions Added align attribute for call to __kmpc_alloc_shared. dhruvachak: Added align attribute for call to __kmpc_alloc_shared.
		CGM.getLLVMContext(), llvm::Attribute::Alignment,
		CGM.getContext().getTargetInfo().getNewAlign() / 8));

		llvm::Value *AllocArgs[] = {LocalPtr, GlobalPtr};
		AggregatePtr = CGF.EmitRuntimeCall(
		OMPBuilder.getOrCreateRuntimeFunction(
		CGM.getModule(), OMPRTL___kmpc_alloc_aggregate_arg),
		AllocArgs);

		llvm::Value *CapturedVarVal = Bld.CreateAlignedLoad(
		PtrElemTy, AggregateV, DL.getABITypeAlign(PtrElemTy));
		llvm::Value *AggregatePtrCast = Bld.CreatePointerBitCastOrAddrSpaceCast(
		AggregatePtr, PtrElemTy->getPointerTo());
		Bld.CreateDefaultAlignedStore(CapturedVarVal, AggregatePtrCast);
		} else {
		AggregatePtr = llvm::Constant::getNullValue(OMPBuilder.VoidPtr);
}		}

llvm::Value *IfCondVal = nullptr;		llvm::Value *IfCondVal = nullptr;
if (IfCond)		if (IfCond)
IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,		IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
/* isSigned */ false);		/* isSigned */ false);
else		else
IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);		IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

if (!NumThreadsVal)		if (!NumThreadsVal)
NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);		NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
else		else
NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),		NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),

assert(IfCondVal && "Expected a value");		assert(IfCondVal && "Expected a value");
		assert(AggregatePtr && "Expected non-null aggregate pointer value");
		// Create the parallel call.
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);		llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *Args[] = {		llvm::Value *Args[] = {RTLoc,
RTLoc,
getThreadID(CGF, Loc),		getThreadID(CGF, Loc),
IfCondVal,		IfCondVal,
NumThreadsVal,		NumThreadsVal,
llvm::ConstantInt::get(CGF.Int32Ty, -1),		llvm::ConstantInt::get(CGF.Int32Ty, -1),
FnPtr,		FnPtr,
ID,		ID,
Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),		AggregatePtr};
CGF.VoidPtrPtrTy),
llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(		CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_parallel_51),		CGM.getModule(), OMPRTL___kmpc_parallel_51),
Args);		Args);

		if (AllocSize) {
		assert(GlobalPtr && "Expected non-null global pointer value");
		// Pop global memory used for argument allocation.
		CGF.EmitRuntimeCall(
		OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
		jhuber6Unsubmitted Done Reply Inline Actions This needs to include the size of the accompanying push after D106496. jhuber6: This needs to include the size of the accompanying push after D106496.
		OMPRTL___kmpc_free_shared),
		{GlobalPtr, llvm::ConstantInt::get(CGM.SizeTy, AllocSize)});
		ABataevUnsubmitted Done Reply Inline Actions Enclose in braces too ABataev: Enclose in braces too
		}
};		};

RegionCodeGenTy RCG(ParallelGen);		RegionCodeGenTy RCG(ParallelGen);
RCG(CGF);		RCG(CGF);
}		}

void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {		void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
// Always emit simple barriers!		// Always emit simple barriers!
▲ Show 20 Lines • Show All 1,921 Lines • ▼ Show 20 Lines	llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
Fn->setLinkage(llvm::GlobalValue::InternalLinkage);		Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
Fn->setDoesNotRecurse();		Fn->setDoesNotRecurse();

CodeGenFunction CGF(CGM, /suppressNewContext=/true);		CodeGenFunction CGF(CGM, /suppressNewContext=/true);
CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,		CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
D.getBeginLoc(), D.getBeginLoc());		D.getBeginLoc(), D.getBeginLoc());

const auto *RD = CS.getCapturedRecordDecl();		const auto *RD = CS.getCapturedRecordDecl();
auto CurField = RD->field_begin();

Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,		Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
/Name=/".zero.addr");		/Name=/".zero.addr");
CGF.Builder.CreateStore(CGF.Builder.getInt32(/C/ 0), ZeroAddr);		CGF.Builder.CreateStore(CGF.Builder.getInt32(/C/ 0), ZeroAddr);
// Get the array of arguments.		// Get the array of arguments.
SmallVector<llvm::Value *, 8> Args;		SmallVector<llvm::Value *, 8> Args;

Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());		Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
Args.emplace_back(ZeroAddr.getPointer());		Args.emplace_back(ZeroAddr.getPointer());

CGBuilderTy &Bld = CGF.Builder;		CGBuilderTy &Bld = CGF.Builder;
auto CI = CS.capture_begin();

// Use global memory for data sharing.		// Use global memory for data sharing.
// Handle passing of global args to workers.		// Handle passing of global args to workers.
Address GlobalArgs =		Address GlobalArgs =
CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");		CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrTy, "global_args");
llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();		llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};		llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(		CGF.EmitRuntimeCall(
CGM.getModule(), OMPRTL___kmpc_get_shared_variables),		OMPBuilder.getOrCreateRuntimeFunction(
		CGM.getModule(), OMPRTL___kmpc_get_shared_variables_aggregate),
DataSharingArgs);		DataSharingArgs);

// Retrieve the shared variables from the list of references returned		// Retrieve the shared variables from the list of references returned
// by the runtime. Pass the variables to the outlined function.		// by the runtime. Pass the variables to the outlined function.
Address SharedArgListAddress = Address::invalid();		Address SharedArgAggregateAddress = Address::invalid();
if (CS.capture_size() > 0 \|\|		if (CS.capture_size() > 0) {
isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {		SharedArgAggregateAddress = CGF.EmitLoadOfPointer(
SharedArgListAddress = CGF.EmitLoadOfPointer(
GlobalArgs, CGF.getContext()		GlobalArgs, CGF.getContext()
.getPointerType(CGF.getContext().VoidPtrTy)		.getPointerType(CGF.getContext().VoidTy)
		dhruvachakUnsubmitted Not Done Reply Inline Actions This should be VoidTy now that GlobalArgs type has changed. dhruvachak: This should be VoidTy now that GlobalArgs type has changed.
.castAs<PointerType>());		.castAs<PointerType>());
}		// Load the outlined arg aggregate struct.
unsigned Idx = 0;
if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
llvm::Value *LB = CGF.EmitLoadOfScalar(
TypedAddress,
/Volatile=/false,
CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
Args.emplace_back(LB);
++Idx;
Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
llvm::Value *UB = CGF.EmitLoadOfScalar(
TypedAddress,
/Volatile=/false,
CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
Args.emplace_back(UB);
++Idx;
}
if (CS.capture_size() > 0) {
ASTContext &CGFContext = CGF.getContext();		ASTContext &CGFContext = CGF.getContext();
for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {		QualType RecordPointerTy =
QualType ElemTy = CurField->getType();		CGFContext.getPointerType(CGFContext.getRecordType(RD));
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(		Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),		SharedArgAggregateAddress,
CGF.ConvertTypeForMem(ElemTy));		CGF.ConvertTypeForMem(RecordPointerTy)->getPointerTo(),
llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,		CGF.ConvertTypeForMem(RecordPointerTy));
/Volatile=/false,		llvm::Value *Arg = TypedAddress.getPointer();
CGFContext.getPointerType(ElemTy),		Args.emplace_back(Arg);
CI->getLocation());		} else {
if (CI->capturesVariableByCopy() &&		// If there are no captured arguments, use nullptr.
!CI->getCapturedVar()->getType()->isAnyPointerType()) {		ASTContext &CGFContext = CGF.getContext();
Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),		QualType RecordPointerTy =
CI->getLocation());		CGFContext.getPointerType(CGFContext.getRecordType(RD));
}		llvm::Value *Arg =
		llvm::Constant::getNullValue(CGF.ConvertTypeForMem(RecordPointerTy));
Args.emplace_back(Arg);		Args.emplace_back(Arg);
}
}		}

emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);		emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
CGF.FinishFunction();		CGF.FinishFunction();
return Fn;		return Fn;
}		}

void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,		void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
▲ Show 20 Lines • Show All 409 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGStmtOpenMP.cpp

Load File

clang/lib/CodeGen/CodeGenFunction.h

	Show First 20 Lines • Show All 3,343 Lines • ▼ Show 20 Lines
	};			};

	/// Returns calculated size of the specified type.			/// Returns calculated size of the specified type.
	llvm::Value *getTypeSize(QualType Ty);			llvm::Value *getTypeSize(QualType Ty);
	LValue InitCapturedStruct(const CapturedStmt &S);			LValue InitCapturedStruct(const CapturedStmt &S);
	llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K);			llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K);
	llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S);			llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S);
	Address GenerateCapturedStmtArgument(const CapturedStmt &S);			Address GenerateCapturedStmtArgument(const CapturedStmt &S);
				llvm::Function *
				GenerateOpenMPCapturedStmtFunctionAggregate(const CapturedStmt &S,
				SourceLocation Loc);
	llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,			llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
	SourceLocation Loc);			SourceLocation Loc);
				void GenerateOpenMPCapturedVarsAggregate(
				const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars,
				SmallVectorImpl<llvm::Type *> &CapturedVarsElemTypes);
				dhruvachakUnsubmitted Not Done Reply Inline Actions CapturedVarsElemTypes introduced to handle opaque pointers. dhruvachak: CapturedVarsElemTypes introduced to handle opaque pointers.
	void GenerateOpenMPCapturedVars(const CapturedStmt &S,			void GenerateOpenMPCapturedVars(const CapturedStmt &S,
	SmallVectorImpl<llvm::Value *> &CapturedVars);			SmallVectorImpl<llvm::Value *> &CapturedVars);
	void emitOMPSimpleStore(LValue LVal, RValue RVal, QualType RValTy,			void emitOMPSimpleStore(LValue LVal, RValue RVal, QualType RValTy,
	SourceLocation Loc);			SourceLocation Loc);
	/// Perform element by element copying of arrays with type \a			/// Perform element by element copying of arrays with type \a
	/// OriginalType from \a SrcAddr to \a DestAddr using copying procedure			/// OriginalType from \a SrcAddr to \a DestAddr using copying procedure
	/// generated by \a CopyGen.			/// generated by \a CopyGen.
	///			///
	▲ Show 20 Lines • Show All 1,534 Lines • Show Last 20 Lines

clang/lib/Sema/SemaOpenMP.cpp

Load File

clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c

Load File

clang/test/AST/ast-dump-openmp-distribute-parallel-for.c

Load File

clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c

Load File

clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c

Load File

clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c

Load File

clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c

Load File

clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c

Load File

clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c

Load File

clang/test/OpenMP/bug54082.c

Load File

clang/test/OpenMP/bug60602.cpp

Load File

clang/test/OpenMP/cancel_codegen.cpp

Load File

clang/test/OpenMP/cancellation_point_codegen.cpp

Load File

clang/test/OpenMP/debug-info-complex-byval.cpp

Load File

clang/test/OpenMP/debug-info-openmp-array.cpp

Load File

clang/test/OpenMP/debug_threadprivate_copyin.c

Load File

clang/test/OpenMP/declare_target_codegen_globalization.cpp

Load File

clang/test/OpenMP/declare_target_constexpr_codegen.cpp

Load File

clang/test/OpenMP/declare_variant_construct_codegen_1.c

Show First 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	// CK1: call void @__omp_offloading_[[OFFLOAD:.+]]({{.+}})

vxv(v1, v2, v3, N);		vxv(v1, v2, v3, N);
// CK1: call void @vxv		// CK1: call void @vxv

#pragma omp parallel		#pragma omp parallel
{		{
vxv(v1, v2, v3, N);		vxv(v1, v2, v3, N);
}		}
// CK1: call void ({{.+}}) @__kmpc_fork_call(%struct.ident_t* {{.+}}, i32 3, void ({{.+}})* bitcast (void (i32, i32, [100 x i32], [100 x i32], [100 x i32]) [[PARALLEL_REGION:@.+]] to void		// CK1: call void ({{.+}}) @__kmpc_fork_call(%struct.ident_t* {{.+}}, i32 1, void ({{.+}})* bitcast (void (i32, i32, %struct.anon{{[\.0-9]}})* [[PARALLEL_REGION:@.+]] to void

return 0;		return 0;
}		}

// CK1: define internal void @__omp_offloading_[[OFFLOAD]]({{.+}})		// CK1: define internal void @__omp_offloading_[[OFFLOAD]]({{.+}})
// CK1: call void ({{.+}}) @__kmpc_fork_teams(%struct.ident_t* {{.+}}, i32 3, void ({{.+}})* bitcast (void (i32, i32, [100 x i32], [100 x i32], [100 x i32]) [[TARGET_REGION:@.+]] to void		// CK1: call void ({{.+}}) @__kmpc_fork_teams(%struct.ident_t* {{.+}}, i32 1, void ({{.+}})* bitcast (void (i32, i32, %struct.anon{{[\.0-9]}})* [[TARGET_REGION:@.+]] to void
// CK1: define internal void [[TARGET_REGION]](		// CK1: define internal void [[TARGET_REGION]](
// CK1: call void @t_vxv		// CK1: call void @t_vxv

// CK1: define internal void [[PARALLEL_REGION]](		// CK1: define internal void [[PARALLEL_REGION]](
// CK1: call void @p_vxv		// CK1: call void @p_vxv
#endif // CK1		#endif // CK1

// RUN: %clang_cc1 -no-opaque-pointers -DCK2 -verify -fopenmp -triple x86_64-unknown-linux -emit-llvm %s -o - \| FileCheck %s --check-prefix=CK2		// RUN: %clang_cc1 -no-opaque-pointers -DCK2 -verify -fopenmp -triple x86_64-unknown-linux -emit-llvm %s -o - \| FileCheck %s --check-prefix=CK2
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	#pragma omp target
test_base(v1, v2, v3, 0);		test_base(v1, v2, v3, 0);
}		}
// CK2: call void @__omp_offloading_[[OFFLOAD_2:.+]]({{.+}})		// CK2: call void @__omp_offloading_[[OFFLOAD_2:.+]]({{.+}})

#pragma omp parallel		#pragma omp parallel
{		{
test_base(v1, v2, v3, 0);		test_base(v1, v2, v3, 0);
}		}
// CK2: call void ({{.+}}) @__kmpc_fork_call(%struct.ident_t* {{.+}}, i32 3, void (i32, i32, ...)* bitcast (void (i32, i32, i32**, i32, i32*) [[PARALLEL_REGION:@.+]] to void		// CK2: call void ({{.+}}) @__kmpc_fork_call(%struct.ident_t* {{.+}}, i32 1, void (i32, i32, ...)* bitcast (void (i32, i32, %struct.anon{{[\.0-9]}})* [[PARALLEL_REGION:@.+]] to void
}		}

// CK2: define internal void @__omp_offloading_[[OFFLOAD_1]]({{.+}})		// CK2: define internal void @__omp_offloading_[[OFFLOAD_1]]({{.+}})
// CK2: call void ({{.+}}) @__kmpc_fork_teams(%struct.ident_t* {{.+}}, i32 3, void ({{.+}})* bitcast (void (i32, i32, i32**, i32, i32*) [[TARGET_REGION_1:@.+]] to void		// CK2: call void ({{.+}}) @__kmpc_fork_teams(%struct.ident_t* {{.+}}, i32 1, void ({{.+}})* bitcast (void (i32, i32, %struct.anon{{[\.0-9]}})* [[TARGET_REGION_1:@.+]] to void
// CK2: define internal void [[TARGET_REGION_1]](		// CK2: define internal void [[TARGET_REGION_1]](
// CK2: call void @test_teams		// CK2: call void @test_teams

// CK2: define internal void @__omp_offloading_[[OFFLOAD_2]]({{.+}})		// CK2: define internal void @__omp_offloading_[[OFFLOAD_2]]({{.+}})
// CK2: call void @test_target		// CK2: call void @test_target

// CK2: define internal void [[PARALLEL_REGION]](		// CK2: define internal void [[PARALLEL_REGION]](
// CK2: call void @test_parallel		// CK2: call void @test_parallel
▲ Show 20 Lines • Show All 143 Lines • ▼ Show 20 Lines	for (int i = 0; i < N; i++)
vxv(v1, v2, v3, N);		vxv(v1, v2, v3, N);
// CK4: call void @__omp_offloading_[[OFFLOAD_2:.+]]({{.+}})		// CK4: call void @__omp_offloading_[[OFFLOAD_2:.+]]({{.+}})
}		}
// CK4-DAG: call void @all_vxv		// CK4-DAG: call void @all_vxv
// CK4-DAG: call void @combined_vxv		// CK4-DAG: call void @combined_vxv

#endif // CK4		#endif // CK4

#endif // HEADER		#endif // HEADER
		jdoerfertUnsubmitted Not Done Reply Inline Actions Something went wrong here. Might be easier to manually change the __kmpc_fork_call line (should not be much more) jdoerfert: Something went wrong here. Might be easier to manually change the __kmpc_fork_call line (should…

clang/test/OpenMP/distribute_codegen.cpp

Load File

clang/test/OpenMP/distribute_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp

Load File

clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp

Load File

clang/test/OpenMP/distribute_private_codegen.cpp

Load File

clang/test/OpenMP/distribute_simd_codegen.cpp

Load File

clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/distribute_simd_private_codegen.cpp

Load File

clang/test/OpenMP/distribute_simd_reduction_codegen.cpp

Load File

clang/test/OpenMP/for_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/for_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/for_linear_codegen.cpp

Load File

clang/test/OpenMP/for_private_codegen.cpp

Load File

clang/test/OpenMP/for_reduction_codegen.cpp

Load File

clang/test/OpenMP/for_reduction_codegen_UDR.cpp

Load File

clang/test/OpenMP/for_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/irbuilder_safelen.cpp

Load File

clang/test/OpenMP/irbuilder_safelen_order_concurrent.cpp

Load File

clang/test/OpenMP/irbuilder_simd_aligned.cpp

Load File

clang/test/OpenMP/irbuilder_simdlen.cpp

Load File

clang/test/OpenMP/irbuilder_simdlen_safelen.cpp

Load File

clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp

Load File

clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp

Load File

clang/test/OpenMP/metadirective_device_kind_codegen.c

Load File

clang/test/OpenMP/metadirective_device_kind_codegen.cpp

Load File

clang/test/OpenMP/metadirective_implementation_codegen.cpp

Load File

clang/test/OpenMP/nested_loop_codegen.cpp

Load File

clang/test/OpenMP/nvptx_SPMD_codegen.cpp

Load File

clang/test/OpenMP/nvptx_allocate_codegen.cpp

Load File

clang/test/OpenMP/nvptx_data_sharing.cpp

Load File

clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp

Load File

clang/test/OpenMP/nvptx_lambda_capturing.cpp

Load File

clang/test/OpenMP/nvptx_lambda_pointer_capturing.cpp

Load File

clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp

Load File

clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp

Load File

clang/test/OpenMP/nvptx_parallel_codegen.cpp

Load File

clang/test/OpenMP/nvptx_parallel_for_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_parallel_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp

Load File

clang/test/OpenMP/nvptx_target_teams_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp

Load File

clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp

Load File

clang/test/OpenMP/nvptx_teams_codegen.cpp

Load File

clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp

Load File

clang/test/OpenMP/openmp_win_codegen.cpp

Load File

clang/test/OpenMP/outlined_artificial.c

Load File

clang/test/OpenMP/parallel_codegen.cpp

Load File

clang/test/OpenMP/parallel_copyin_codegen.cpp

Load File

clang/test/OpenMP/parallel_copyin_combined_codegen.c

Load File

clang/test/OpenMP/parallel_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/parallel_for_codegen.cpp

Load File

clang/test/OpenMP/parallel_for_lastprivate_conditional.cpp

Load File

clang/test/OpenMP/parallel_for_linear_codegen.cpp

Load File

clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/parallel_for_simd_aligned_codegen.cpp

Load File

clang/test/OpenMP/parallel_if_codegen.cpp

Load File

clang/test/OpenMP/parallel_if_codegen_PR51349.cpp

Load File

clang/test/OpenMP/parallel_masked.cpp

Load File

clang/test/OpenMP/parallel_masked_target.cpp

Load File

clang/test/OpenMP/parallel_master_codegen.cpp

Load File

clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/parallel_master_taskloop_codegen.cpp

Load File

clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/parallel_master_taskloop_simd_codegen.cpp

Load File

clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/parallel_private_codegen.cpp

Load File

clang/test/OpenMP/parallel_reduction_codegen.cpp

Load File

clang/test/OpenMP/parallel_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/parallel_sections_codegen.cpp

Load File

clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/reduction_compound_op.cpp

Load File

clang/test/OpenMP/reduction_implicit_map.cpp

Load File

clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c

Load File

clang/test/OpenMP/remarks_parallel_in_target_state_machine.c

Load File

clang/test/OpenMP/sections_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/sections_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/sections_private_codegen.cpp

Load File

clang/test/OpenMP/sections_reduction_codegen.cpp

Load File

clang/test/OpenMP/sections_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/single_codegen.cpp

Load File

clang/test/OpenMP/single_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/single_private_codegen.cpp

Load File

clang/test/OpenMP/target_codegen_global_capture.cpp

Load File

clang/test/OpenMP/target_data_map_codegen_hold.cpp

Load File

clang/test/OpenMP/target_in_reduction_codegen.cpp

Load File

clang/test/OpenMP/target_map_codegen_03.cpp

Load File

clang/test/OpenMP/target_map_codegen_hold.cpp

Load File

clang/test/OpenMP/target_map_member_expr_codegen.cpp

Load File

clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_debug_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_for_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_for_debug_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_for_simd_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_if_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_num_threads_codegen.cpp

Load File

clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/target_teams_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_private_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp

Load File

clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp

Load File

clang/test/OpenMP/target_teams_map_codegen.cpp

Load File

clang/test/OpenMP/target_teams_num_teams_codegen.cpp

Load File

clang/test/OpenMP/target_teams_thread_limit_codegen.cpp

Load File

clang/test/OpenMP/task_codegen.cpp

Load File

clang/test/OpenMP/task_if_codegen.cpp

Load File

clang/test/OpenMP/task_in_reduction_codegen.cpp

Load File

clang/test/OpenMP/taskgroup_codegen.cpp

Load File

clang/test/OpenMP/taskloop_in_reduction_codegen.cpp

Load File

clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp

Load File

clang/test/OpenMP/teams_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_collapse_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_private_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_reduction_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_simd_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp

Load File

clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp

Load File

clang/test/OpenMP/teams_firstprivate_codegen.cpp

Load File

clang/test/OpenMP/teams_private_codegen.cpp

Load File

clang/test/OpenMP/tile_codegen.cpp

Load File

clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp

Load File

clang/test/OpenMP/vla_crash.c

Load File

clang/test/utils/update_cc_test_checks/Inputs/generated-funcs.c.generated.expected

Load File

clang/test/utils/update_cc_test_checks/Inputs/generated-funcs.c.no-generated.expected

Load File

llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

Show First 20 Lines • Show All 450 Lines • ▼ Show 20 Lines
__OMP_RTL(__kmpc_task_allow_completion_event, false, VoidPtr, IdentPtr,		__OMP_RTL(__kmpc_task_allow_completion_event, false, VoidPtr, IdentPtr,
/* Int / Int32, / kmp_task_t */ VoidPtr)		/* Int / Int32, / kmp_task_t */ VoidPtr)

/// OpenMP Device runtime functions		/// OpenMP Device runtime functions
__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1)		__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1)
__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8)		__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)		__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,		__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)		VoidPtr, VoidPtr, VoidPtr)
__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)		__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )		__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)		__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)		__OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16)		__OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16)
__OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr, Int32,		__OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
Int32, SizeTy, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)		Int32, SizeTy, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
__OMP_RTL(__kmpc_nvptx_end_reduce_nowait, false, Void, Int32)		__OMP_RTL(__kmpc_nvptx_end_reduce_nowait, false, Void, Int32)
__OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,		__OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
VoidPtr, Int32, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr,		VoidPtr, Int32, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr,
GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr)		GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr)

__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)		__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)

__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)		__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)		__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)		__OMP_RTL(__kmpc_alloc_aggregate_arg, false, VoidPtr, VoidPtr, VoidPtr)
__OMP_RTL(__kmpc_end_sharing_variables, false, Void, )		__OMP_RTL(__kmpc_get_shared_variables_aggregate, false, Void, VoidPtrPtr)
__OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)
__OMP_RTL(__kmpc_parallel_level, false, Int16, IdentPtr, Int32)		__OMP_RTL(__kmpc_parallel_level, false, Int16, IdentPtr, Int32)
__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )		__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)		__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)		__OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)

__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)		__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)		__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)

▲ Show 20 Lines • Show All 437 Lines • ▼ Show 20 Lines	__OMP_RTL_ATTRS(__kmpc_doacross_fini, BarrierAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, SExt))		ParamAttrs(ReadOnlyPtrAttrs, SExt))

__OMP_RTL_ATTRS(__kmpc_alloc_shared,		__OMP_RTL_ATTRS(__kmpc_alloc_shared,
AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync),		AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync),
AllocSizeAttr(0, std::nullopt)),		AllocSizeAttr(0, std::nullopt)),
ReturnPtrAttrs, ParamAttrs(SizeTyExt))		ReturnPtrAttrs, ParamAttrs(SizeTyExt))
__OMP_RTL_ATTRS(__kmpc_free_shared, DeviceAllocAttrs, AttributeSet(),		__OMP_RTL_ATTRS(__kmpc_free_shared, DeviceAllocAttrs, AttributeSet(),
ParamAttrs(AttributeSet(EnumAttr(NoCapture),		ParamAttrs(AttributeSet(EnumAttr(NoCapture),
EnumAttr(AllocatedPointer)),		EnumAttr(AllocatedPointer)),
		[Inline comment by dhruvachak (unsubmitted, not done)] NoCapture attributes for the parameters need to be removed. See https://github.com/llvm/llvm-project/issues/54654
SizeTyExt))		SizeTyExt))
__OMP_RTL_ATTRS(__kmpc_begin_sharing_variables, AttributeSet(), AttributeSet(),
ParamAttrs(AttributeSet(), SizeTyExt))

		__OMP_RTL_ATTRS(__kmpc_alloc_aggregate_arg, DefaultAttrs, ReturnPtrAttrs,
		[Inline comment by dhruvachak (unsubmitted, not done)] Fixed attribute of __kmpc_alloc_aggregate_arg.
		ParamAttrs())
__OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs,		__OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs,
ParamAttrs(SExt, SizeTyExt))		ParamAttrs(SExt, SizeTyExt))
__OMP_RTL_ATTRS(__kmpc_aligned_alloc, DefaultAttrs, ReturnPtrAttrs,		__OMP_RTL_ATTRS(__kmpc_aligned_alloc, DefaultAttrs, ReturnPtrAttrs,
ParamAttrs(SExt, SizeTyExt, SizeTyExt))		ParamAttrs(SExt, SizeTyExt, SizeTyExt))
__OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(),		__OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(),
ParamAttrs(SExt))		ParamAttrs(SExt))

__OMP_RTL_ATTRS(__tgt_interop_init, AttributeSet(), AttributeSet(),		__OMP_RTL_ATTRS(__tgt_interop_init, AttributeSet(), AttributeSet(),
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines	__OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs,
ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs, SExt))		ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs, SExt))

__OMP_RTL_ATTRS(__kmpc_target_init, AttributeSet(), SExt,		__OMP_RTL_ATTRS(__kmpc_target_init, AttributeSet(), SExt,
ParamAttrs(AttributeSet(), SExt, SExt))		ParamAttrs(AttributeSet(), SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_target_deinit, AttributeSet(), AttributeSet(),		__OMP_RTL_ATTRS(__kmpc_target_deinit, AttributeSet(), AttributeSet(),
ParamAttrs(AttributeSet(), SExt))		ParamAttrs(AttributeSet(), SExt))
__OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(),		__OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(AttributeSet(), SExt, SExt, SExt, SExt,		ParamAttrs(AttributeSet(), SExt, SExt, SExt, SExt,
AttributeSet(), AttributeSet(), AttributeSet(),		AttributeSet(), AttributeSet(), AttributeSet()))
SizeTyExt))
__OMP_RTL_ATTRS(__kmpc_serialized_parallel, InaccessibleArgOnlyAttrs,		__OMP_RTL_ATTRS(__kmpc_serialized_parallel, InaccessibleArgOnlyAttrs,
AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs, SExt))		AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs, SExt))
__OMP_RTL_ATTRS(__kmpc_end_serialized_parallel, InaccessibleArgOnlyAttrs,		__OMP_RTL_ATTRS(__kmpc_end_serialized_parallel, InaccessibleArgOnlyAttrs,
AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs, SExt))		AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs, SExt))
__OMP_RTL_ATTRS(__kmpc_shuffle_int32, AttributeSet(), SExt,		__OMP_RTL_ATTRS(__kmpc_shuffle_int32, AttributeSet(), SExt,
ParamAttrs(SExt, SExt, SExt))		ParamAttrs(SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_nvptx_parallel_reduce_nowait_v2, AttributeSet(), SExt,		__OMP_RTL_ATTRS(__kmpc_nvptx_parallel_reduce_nowait_v2, AttributeSet(), SExt,
ParamAttrs(AttributeSet(), SExt, SExt, SizeTyExt))		ParamAttrs(AttributeSet(), SExt, SExt, SizeTyExt))
▲ Show 20 Lines • Show All 293 Lines • Show Last 20 Lines

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Show First 20 Lines • Show All 4,666 Lines • ▼ Show 20 Lines	void initialize(Attributor &A) override {
case OMPRTL___kmpc_single:		case OMPRTL___kmpc_single:
case OMPRTL___kmpc_end_single:		case OMPRTL___kmpc_end_single:
case OMPRTL___kmpc_master:		case OMPRTL___kmpc_master:
case OMPRTL___kmpc_end_master:		case OMPRTL___kmpc_end_master:
case OMPRTL___kmpc_barrier:		case OMPRTL___kmpc_barrier:
case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:		case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:		case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
case OMPRTL___kmpc_nvptx_end_reduce_nowait:		case OMPRTL___kmpc_nvptx_end_reduce_nowait:
		case OMPRTL___kmpc_alloc_aggregate_arg:
		[Inline comment by dhruvachak (unsubmitted, not done)] @jdoerfert Is this enough to enable SPMDization, or is further handling required?
		[Inline comment by dhruvachak (unsubmitted, not done)] Just to be clear, this change does allow SPMDization now, but I want to make sure nothing else is missing.
		[Inline comment by dhruvachak (unsubmitted, not done)] Added the alloc_aggregate_arg entry point to the OpenMPOpt SPMD list.
break;		break;
case OMPRTL___kmpc_distribute_static_init_4:		case OMPRTL___kmpc_distribute_static_init_4:
case OMPRTL___kmpc_distribute_static_init_4u:		case OMPRTL___kmpc_distribute_static_init_4u:
case OMPRTL___kmpc_distribute_static_init_8:		case OMPRTL___kmpc_distribute_static_init_8:
case OMPRTL___kmpc_distribute_static_init_8u:		case OMPRTL___kmpc_distribute_static_init_8u:
case OMPRTL___kmpc_for_static_init_4:		case OMPRTL___kmpc_for_static_init_4:
case OMPRTL___kmpc_for_static_init_4u:		case OMPRTL___kmpc_for_static_init_4u:
case OMPRTL___kmpc_for_static_init_8:		case OMPRTL___kmpc_for_static_init_8:
▲ Show 20 Lines • Show All 884 Lines • Show Last 20 Lines

openmp/libomptarget/DeviceRTL/include/Interface.h

Load File

openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen

Load File

openmp/libomptarget/DeviceRTL/src/Parallelism.cpp

Load File

openmp/libomptarget/DeviceRTL/src/State.cpp

Load File

openmp/libomptarget/utils/generate_microtask_cases.py

Load File

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Codegen aggregate for outlined function captures (Accepted, Public)

Details

Diff Detail

Event Timeline

command stderr:

Large Diff

Revision Contents

Diff 510797

clang/lib/CodeGen/CGOpenMPRuntime.h

clang/lib/CodeGen/CGOpenMPRuntime.cpp

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

clang/lib/CodeGen/CGStmtOpenMP.cpp

clang/lib/CodeGen/CodeGenFunction.h

clang/lib/Sema/SemaOpenMP.cpp

clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c

clang/test/AST/ast-dump-openmp-distribute-parallel-for.c

clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c

clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c

clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c

clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c

clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c

clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c

clang/test/OpenMP/bug54082.c

clang/test/OpenMP/bug60602.cpp

clang/test/OpenMP/cancel_codegen.cpp

clang/test/OpenMP/cancellation_point_codegen.cpp

clang/test/OpenMP/debug-info-complex-byval.cpp

clang/test/OpenMP/debug-info-openmp-array.cpp

clang/test/OpenMP/debug_threadprivate_copyin.c

clang/test/OpenMP/declare_target_codegen_globalization.cpp

clang/test/OpenMP/declare_target_constexpr_codegen.cpp

clang/test/OpenMP/declare_variant_construct_codegen_1.c

clang/test/OpenMP/distribute_codegen.cpp

clang/test/OpenMP/distribute_firstprivate_codegen.cpp

clang/test/OpenMP/distribute_lastprivate_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp

clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp

clang/test/OpenMP/distribute_private_codegen.cpp

clang/test/OpenMP/distribute_simd_codegen.cpp

clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp

clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp

clang/test/OpenMP/distribute_simd_private_codegen.cpp

clang/test/OpenMP/distribute_simd_reduction_codegen.cpp

clang/test/OpenMP/for_firstprivate_codegen.cpp

clang/test/OpenMP/for_lastprivate_codegen.cpp

clang/test/OpenMP/for_linear_codegen.cpp

clang/test/OpenMP/for_private_codegen.cpp

clang/test/OpenMP/for_reduction_codegen.cpp

clang/test/OpenMP/for_reduction_codegen_UDR.cpp

clang/test/OpenMP/for_reduction_task_codegen.cpp

clang/test/OpenMP/irbuilder_safelen.cpp

clang/test/OpenMP/irbuilder_safelen_order_concurrent.cpp

clang/test/OpenMP/irbuilder_simd_aligned.cpp

clang/test/OpenMP/irbuilder_simdlen.cpp

clang/test/OpenMP/irbuilder_simdlen_safelen.cpp

clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp

clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp

clang/test/OpenMP/metadirective_device_kind_codegen.c

clang/test/OpenMP/metadirective_device_kind_codegen.cpp

clang/test/OpenMP/metadirective_implementation_codegen.cpp

clang/test/OpenMP/nested_loop_codegen.cpp

clang/test/OpenMP/nvptx_SPMD_codegen.cpp

clang/test/OpenMP/nvptx_allocate_codegen.cpp

clang/test/OpenMP/nvptx_data_sharing.cpp

clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp

[OpenMP] Codegen aggregate for outlined function captures
Accepted, Public