[committed] OpenACC Profiling Interface (incomplete)

Message ID yxfplfz4anys.fsf@hertz.schwinge.homeip.net
State New
Headers show
Series
  • [committed] OpenACC Profiling Interface (incomplete)
Related show

Commit Message

Thomas Schwinge May 17, 2019, 7:19 p.m.
Hi!

On Thu, 16 May 2019 21:43:47 +0200, I wrote:
> On Thu, 16 May 2019 17:54:23 +0200, Jakub Jelinek <jakub@redhat.com> wrote:

> > On Thu, May 16, 2019 at 05:21:56PM +0200, Thomas Schwinge wrote:

> > > > Jakub, would you please especially review the non-OpenACC-specific

> > > > changes here, including the libgomp ABI changes?

> > > 

> > > Given a baseline that I've not yet posted ;-) [...]



> > > "The OpenACC runtime will invoke 'acc_register_library',


> Will [...] work for the case of static linking, though?


I think I have an idea, but:

> Anyway, I'll defer the actual implementation for later.



> But I'll still now include in the commit that I'm preparing the

> 'acc_register_library' prototype in <openacc.h>, and also its symbol

> version, because these things apply no matter whether we now call that

> function from 'goacc_profiling_initialize' or not.

> 

> Does the 'acc_register_library' symbol version need to be backed by a

> (stub) function definition?  It builds without, but it doesn't appear in

> 'readelf --dyn-syms x86_64-pc-linux-gnu/libgomp/.libs/libgomp.so'; is

> that OK or not?


I'm simply including a stub function, which can then be replaced, later
on.


Committed to trunk in r271346 "OpenACC Profiling Interface (incomplete)",
see attached.


Grüße
 Thomas

Comments

Thomas Schwinge July 27, 2021, 9:33 a.m. | #1
Hi!

On 2019-05-17T21:19:07+0200, I wrote:
> Committed to trunk in r271346 "OpenACC Profiling Interface (incomplete)"


There in an 'async' issue in two of the test cases added here, as figured
out during review/testing with GCN offloading Julian's patch to 'Fix
OpenACC "ephemeral" asynchronous host-to-device copies'.  I've pushed
"[OpenACC] Clarify sequencing of 'async' data copying vs. profiling
events in 'libgomp.oacc-c-c++-common/acc_prof-{init,parallel}-1.c'" to
master branch in commit 29ddaf43f70e19fd1110b539e8b3d0436c757e34, see
attached.


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
From 29ddaf43f70e19fd1110b539e8b3d0436c757e34 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>

Date: Fri, 23 Jul 2021 15:07:34 +0200
Subject: [PATCH] [OpenACC] Clarify sequencing of 'async' data copying vs.
 profiling events in 'libgomp.oacc-c-c++-common/acc_prof-{init,parallel}-1.c'

... as noticed with GCN offloading.

Fix-up for r271346 (commit 5fae049dc272144f8e61af94ee0ba42b270915e5)
"OpenACC Profiling Interface (incomplete)".

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Clarify
	sequencing of 'async' data copying vs. profiling events.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
	Likewise.
---
 .../acc_prof-init-1.c                         | 49 ++++++++++++------
 .../acc_prof-parallel-1.c                     | 51 +++++++++++++------
 2 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
index 7d05f482f46..b5e77155460 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -19,6 +19,19 @@
 #define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
 
 
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
 static int state = -1;
 
 #define STATE_OP(state, op) \
@@ -34,7 +47,7 @@ static int state = -1;
 
 static acc_device_t acc_device_type;
 static int acc_device_num;
-static int acc_async;
+static int acc_async = acc_async_sync;
 
 
 struct tool_info
@@ -192,6 +205,21 @@ static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
 
   assert (state == 11
 	  || state == 111);
+#if defined COPYIN
+  /* In an 'async' setting, this event may be triggered before actual 'async'
+     data copying has completed.  Given that 'state' appears in 'COPYIN', we
+     first have to synchronize (that is, let the 'async' 'COPYIN' read the
+     current 'state' value)...  */
+  if (acc_async != acc_async_sync)
+    {
+      /* "We're not yet accounting for the fact that _OpenACC events may occur
+	 during event processing_"; temporarily disable to avoid deadlock.  */
+      unreg (acc_ev_none, NULL, acc_toggle_per_thread);
+      acc_wait (acc_async);
+      reg (acc_ev_none, NULL, acc_toggle_per_thread);
+    }
+  /* ... before modifying it in the following.  */
+#endif
   STATE_OP (state, ++);
 
   assert (tool_info != NULL);
@@ -240,19 +268,6 @@ static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
 }
 
 
-static acc_prof_reg reg;
-static acc_prof_reg unreg;
-static acc_prof_lookup_func lookup;
-void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
-{
-  DEBUG_printf ("%s\n", __FUNCTION__);
-
-  reg = reg_;
-  unreg = unreg_;
-  lookup = lookup_;
-}
-
-
 int main()
 {
   acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
@@ -271,14 +286,15 @@ int main()
 
   acc_device_type = acc_get_device_type ();
   acc_device_num = acc_get_device_num (acc_device_type);
-  acc_async = 12;
 
   {
     int state_init;
+    acc_async = 12;
 #pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
     {
       state_init = state;
     }
+    acc_async = acc_async_sync;
 #pragma acc wait
     assert (state_init == 11);
   }
@@ -297,14 +313,15 @@ int main()
 
   acc_device_type = acc_get_device_type ();
   acc_device_num = acc_get_device_num (acc_device_type);
-  acc_async = 12;
 
   {
     int state_init;
+    acc_async = 12;
 #pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
     {
       state_init = state;
     }
+    acc_async = acc_async_sync;
 #pragma acc wait
     assert (state_init == 111);
   }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
index a5e9ab3f936..1f503861cb6 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -29,6 +29,19 @@
 #define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
 
 
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
 static int state = -1;
 
 #define STATE_OP(state, op) \
@@ -44,7 +57,7 @@ static int state = -1;
 
 static acc_device_t acc_device_type;
 static int acc_device_num;
-static int acc_async;
+static int acc_async = acc_async_sync;
 
 
 struct tool_info
@@ -235,6 +248,25 @@ static void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_i
 
   assert (state == 4
 	  || state == 104);
+#if defined COPYIN
+  /* Conceptually, 'acc_ev_enter_data_end' marks the end of data copying,
+     before 'acc_ev_enqueue_launch_start' marks invoking the compute region.
+     That's the 'state_init = state;' intended to be captured in the compute
+     regions.  */
+  /* In an 'async' setting, this event may be triggered before actual 'async'
+     data copying has completed.  Given that 'state' appears in 'COPYIN', we
+     first have to synchronize (that is, let the 'async' 'COPYIN' read the
+     current 'state' value)...  */
+  if (acc_async != acc_async_sync)
+    {
+      /* "We're not yet accounting for the fact that _OpenACC events may occur
+	 during event processing_"; temporarily disable to avoid deadlock.  */
+      unreg (acc_ev_none, NULL, acc_toggle_per_thread);
+      acc_wait (acc_async);
+      reg (acc_ev_none, NULL, acc_toggle_per_thread);
+    }
+  /* ... before modifying it in the following.  */
+#endif
   STATE_OP (state, ++);
 
   assert (tool_info != NULL);
@@ -664,19 +696,6 @@ static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
 }
 
 
-static acc_prof_reg reg;
-static acc_prof_reg unreg;
-static acc_prof_lookup_func lookup;
-void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
-{
-  DEBUG_printf ("%s\n", __FUNCTION__);
-
-  reg = reg_;
-  unreg = unreg_;
-  lookup = lookup_;
-}
-
-
 int main()
 {
   acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
@@ -696,7 +715,6 @@ int main()
 
   acc_device_type = acc_get_device_type ();
   acc_device_num = acc_get_device_num (acc_device_type);
-  acc_async = acc_async_sync;
   assert (state == 0);
 
   {
@@ -713,15 +731,16 @@ int main()
 
   STATE_OP (state, = 100);
 
-  acc_async = 12;
   {
     int state_init;
+    acc_async = 12;
 #pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
     {
       asm volatile ("" : : : "memory"); // TODO PR90488
 
       state_init = state;
     }
+    acc_async = acc_async_sync;
 #pragma acc wait
     assert (state_init == 104);
   }
-- 
2.30.2

Patch

From 5feffd51c51c0ca461c3819db9ff364ed2eec1c5 Mon Sep 17 00:00:00 2001
From: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Fri, 17 May 2019 19:13:36 +0000
Subject: [PATCH] OpenACC Profiling Interface (incomplete)

	libgomp/
	* acc_prof.h: New file.
	* oacc-profiling.c: Likewise.
	* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
	Add these, respectively.
	* Makefile.in: Regenerate.
	* env.c (initialize_env): Call goacc_profiling_initialize.
	* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
	* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
	* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
	acc_prof_register, acc_prof_unregister, and acc_register_library.
	(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
	GOMP_PLUGIN_goacc_thread.
	* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
	prof_callbacks_enabled members.
	(goacc_prof_enabled, goacc_profiling_initialize)
	(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
	(goacc_profiling_dispatch): Declare.
	(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
	(GOACC_PROFILING_SETUP_P): Define.
	* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
	(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
	OpenACC Profiling Interface.
	* oacc-cuda.c (acc_get_current_cuda_device)
	(acc_get_current_cuda_context, acc_get_cuda_stream)
	(acc_set_cuda_stream): Likewise.
	* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
	(acc_init, acc_set_device_type, acc_get_device_type)
	(acc_get_device_num, goacc_lazy_initialize): Likewise.
	* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
	(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
	(acc_unmap_data, present_create_copy, delete_copyout)
	(update_dev_host): Likewise.
	* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
	(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
	Likewise.
	* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
	(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
	Likewise.
	* libgomp.texi: Update.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
	file.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
	Likewise.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@271346 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog                             |  55 ++
 libgomp/Makefile.am                           |   4 +-
 libgomp/Makefile.in                           |   7 +-
 libgomp/acc_prof.h                            | 252 ++++++
 libgomp/env.c                                 |   2 +
 libgomp/libgomp.map                           |  14 +
 libgomp/libgomp.texi                          | 412 ++++++++++
 libgomp/oacc-async.c                          | 121 ++-
 libgomp/oacc-cuda.c                           |  81 +-
 libgomp/oacc-init.c                           | 124 ++-
 libgomp/oacc-int.h                            |  29 +
 libgomp/oacc-mem.c                            | 113 ++-
 libgomp/oacc-parallel.c                       | 460 ++++++++++-
 libgomp/oacc-plugin.c                         |  17 +
 libgomp/oacc-plugin.h                         |   7 +
 libgomp/oacc-profiling.c                      | 662 ++++++++++++++++
 libgomp/plugin/plugin-nvptx.c                 | 196 ++++-
 .../acc_prof-dispatch-1.c                     | 353 +++++++++
 .../acc_prof-init-1.c                         | 316 ++++++++
 .../acc_prof-kernels-1.c                      | 229 ++++++
 .../acc_prof-parallel-1.c                     | 719 ++++++++++++++++++
 .../acc_prof-valid_bytes-1.c                  | 226 ++++++
 .../acc_prof-version-1.c                      |  76 ++
 23 files changed, 4400 insertions(+), 75 deletions(-)
 create mode 100644 libgomp/acc_prof.h
 create mode 100644 libgomp/oacc-profiling.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index a16d5244f54e..e0a128f948f7 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,58 @@ 
+2019-05-17  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* acc_prof.h: New file.
+	* oacc-profiling.c: Likewise.
+	* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
+	Add these, respectively.
+	* Makefile.in: Regenerate.
+	* env.c (initialize_env): Call goacc_profiling_initialize.
+	* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
+	(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
+	* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
+	(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
+	* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
+	acc_prof_register, acc_prof_unregister, and acc_register_library.
+	(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
+	GOMP_PLUGIN_goacc_thread.
+	* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
+	prof_callbacks_enabled members.
+	(goacc_prof_enabled, goacc_profiling_initialize)
+	(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
+	(goacc_profiling_dispatch): Declare.
+	(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
+	(GOACC_PROFILING_SETUP_P): Define.
+	* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
+	(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
+	OpenACC Profiling Interface.
+	* oacc-cuda.c (acc_get_current_cuda_device)
+	(acc_get_current_cuda_context, acc_get_cuda_stream)
+	(acc_set_cuda_stream): Likewise.
+	* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
+	(acc_init, acc_set_device_type, acc_get_device_type)
+	(acc_get_device_num, goacc_lazy_initialize): Likewise.
+	* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
+	(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
+	(acc_unmap_data, present_create_copy, delete_copyout)
+	(update_dev_host): Likewise.
+	* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
+	(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
+	Likewise.
+	* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
+	(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
+	Likewise.
+	* libgomp.texi: Update.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
+	file.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
+	Likewise.
+
 2019-05-13  Chung-Lin Tang <cltang@codesourcery.com>
 
 	* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am
index 062fded0805a..7d36343a4be1 100644
--- a/libgomp/Makefile.am
+++ b/libgomp/Makefile.am
@@ -65,7 +65,7 @@  libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
 	proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
 	splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
 	oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
-	affinity-fmt.c teams.c
+	affinity-fmt.c teams.c oacc-profiling.c
 
 include $(top_srcdir)/plugin/Makefrag.am
 
@@ -74,7 +74,7 @@  libgomp_la_SOURCES += openacc.f90
 endif
 
 nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
 if USE_FORTRAN
 nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
 	openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
index c7e63f120fdb..1e2fa89f3af8 100644
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -217,7 +217,7 @@  am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
 	target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
 	oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
 	oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
-	teams.lo $(am__objects_1)
+	teams.lo oacc-profiling.lo $(am__objects_1)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -551,7 +551,7 @@  libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
 	affinity.c target.c splay-tree.c libgomp-plugin.c \
 	oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
 	oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
-	affinity-fmt.c teams.c $(am__append_3)
+	affinity-fmt.c teams.c oacc-profiling.c $(am__append_3)
 
 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -575,7 +575,7 @@  libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
 nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
 @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
 @USE_FORTRAN_TRUE@	openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
 
@@ -753,6 +753,7 @@  distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
diff --git a/libgomp/acc_prof.h b/libgomp/acc_prof.h
new file mode 100644
index 000000000000..c7a2197d11a9
--- /dev/null
+++ b/libgomp/acc_prof.h
@@ -0,0 +1,252 @@ 
+/* OpenACC Profiling Interface
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   Contributed by Mentor, a Siemens Business.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ACC_PROF_H
+#define _ACC_PROF_H 1
+
+
+/* The OpenACC specification doesn't say so explicitly, but as its Profiling
+   Interface explicitly makes use of, for example, <openacc.h>'s
+   'acc_device_t', we supposedly are to '#include' that file here.  */
+
+#include <openacc.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Events.  */
+
+typedef enum acc_event_t
+{
+  acc_ev_none = 0,
+  acc_ev_device_init_start,
+  acc_ev_device_init_end,
+  acc_ev_device_shutdown_start,
+  acc_ev_device_shutdown_end,
+  acc_ev_runtime_shutdown,
+  acc_ev_create,
+  acc_ev_delete,
+  acc_ev_alloc,
+  acc_ev_free,
+  acc_ev_enter_data_start,
+  acc_ev_enter_data_end,
+  acc_ev_exit_data_start,
+  acc_ev_exit_data_end,
+  acc_ev_update_start,
+  acc_ev_update_end,
+  acc_ev_compute_construct_start,
+  acc_ev_compute_construct_end,
+  acc_ev_enqueue_launch_start,
+  acc_ev_enqueue_launch_end,
+  acc_ev_enqueue_upload_start,
+  acc_ev_enqueue_upload_end,
+  acc_ev_enqueue_download_start,
+  acc_ev_enqueue_download_end,
+  acc_ev_wait_start,
+  acc_ev_wait_end,
+  acc_ev_last
+} acc_event_t;
+
+
+/* Callbacks Signature.  */
+
+/* "The datatype 'ssize_t' means a signed 32-bit integer for a 32-bit binary
+   and a 64-bit integer for a 64-bit binary".  */
+typedef signed long int _acc_prof_ssize_t;
+/* "The datatype 'size_t' means an unsigned 32-bit integer for a 32-bit binary
+   and a 64-bit integer for a 64-bit binary".  */
+typedef unsigned long int _acc_prof_size_t;
+/* "The datatype 'int' means a 32-bit integer for both 32-bit and 64-bit
+   binaries".  */
+typedef int _acc_prof_int_t;
+
+/* Internal helpers: a struct's 'valid_bytes' may be less than its 'sizeof'.  */
+#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
+  offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
+#if 0 /* Untested.  */
+#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
+  ((_n - 1) * sizeof (_type) + (_valid_bytes_type))
+#endif
+#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
+  (sizeof (_basictype))
+
+typedef struct acc_prof_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  _acc_prof_int_t version;
+  acc_device_t device_type;
+  _acc_prof_int_t device_number;
+  _acc_prof_int_t thread_id;
+  _acc_prof_ssize_t async;
+  _acc_prof_ssize_t async_queue;
+  const char *src_file;
+  const char *func_name;
+  _acc_prof_int_t line_no, end_line_no;
+  _acc_prof_int_t func_line_no, func_end_line_no;
+#define _ACC_PROF_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
+} acc_prof_info;
+
+/* We implement the OpenACC 2.6 Profiling Interface.  */
+
+#define _ACC_PROF_INFO_VERSION 201711
+
+typedef enum acc_construct_t
+{
+  acc_construct_parallel = 0,
+  acc_construct_kernels,
+  acc_construct_loop,
+  acc_construct_data,
+  acc_construct_enter_data,
+  acc_construct_exit_data,
+  acc_construct_host_data,
+  acc_construct_atomic,
+  acc_construct_declare,
+  acc_construct_init,
+  acc_construct_shutdown,
+  acc_construct_set,
+  acc_construct_update,
+  acc_construct_routine,
+  acc_construct_wait,
+  acc_construct_runtime_api,
+  acc_construct_serial
+} acc_construct_t;
+
+typedef struct acc_data_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *var_name;
+  _acc_prof_size_t bytes;
+  const void *host_ptr;
+  const void *device_ptr;
+#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_data_event_info;
+
+typedef struct acc_launch_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *kernel_name;
+  _acc_prof_size_t num_gangs, num_workers, vector_length;
+#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
+} acc_launch_event_info;
+
+typedef struct acc_other_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_other_event_info;
+
+typedef union acc_event_info
+{
+  acc_event_t event_type;
+  acc_data_event_info data_event;
+  acc_launch_event_info launch_event;
+  acc_other_event_info other_event;
+} acc_event_info;
+
+typedef enum acc_device_api
+{
+  acc_device_api_none = 0,
+  acc_device_api_cuda,
+  acc_device_api_opencl,
+  acc_device_api_coi,
+  acc_device_api_other
+} acc_device_api;
+
+typedef struct acc_api_info
+{
+  acc_device_api device_api;
+  _acc_prof_int_t valid_bytes;
+  acc_device_t device_type;
+  _acc_prof_int_t vendor;
+  const void *device_handle;
+  const void *context_handle;
+  const void *async_handle;
+#define _ACC_API_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_api_info;
+
+/* Don't tag 'acc_prof_callback' as '__GOACC_NOTHROW': these functions are
+   provided by user code, and must be expected to do anything.  */
+typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
+				   acc_api_info *);
+
+
+/* Loading the Library.  */
+
+typedef enum acc_register_t
+{
+  acc_reg = 0,
+  acc_toggle = 1,
+  acc_toggle_per_thread = 2
+} acc_register_t;
+
+typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
+extern void acc_prof_register (acc_event_t, acc_prof_callback,
+			       acc_register_t) __GOACC_NOTHROW;
+extern void acc_prof_unregister (acc_event_t, acc_prof_callback,
+				 acc_register_t) __GOACC_NOTHROW;
+typedef void (*acc_query_fn) ();
+typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
+extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
+/* Don't tag 'acc_register_library' as '__GOACC_NOTHROW': this function can be
+   overridden by user code, and must be expected to do anything.  */
+extern void acc_register_library (acc_prof_reg, acc_prof_reg,
+				  acc_prof_lookup_func);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _ACC_PROF_H */
diff --git a/libgomp/env.c b/libgomp/env.c
index 7937fbb056a0..29d9f58cba3a 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -1425,5 +1425,7 @@  initialize_env (void)
   parse_gomp_openacc_dim ();
 
   goacc_runtime_initialize ();
+
+  goacc_profiling_initialize ();
 }
 #endif /* LIBGOMP_OFFLOADED_ONLY */
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index d8e2fd1818b9..c79430f8d8d1 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -476,6 +476,14 @@  OACC_2.5 {
 	acc_update_self_async_array_h_;
 } OACC_2.0.1;
 
+OACC_2.5.1 {
+  global:
+	acc_prof_lookup;
+	acc_prof_register;
+	acc_prof_unregister;
+	acc_register_library;
+} OACC_2.5;
+
 GOACC_2.0 {
   global:
 	GOACC_data_end;
@@ -515,3 +523,9 @@  GOMP_PLUGIN_1.2 {
   global:
 	GOMP_PLUGIN_acc_default_dim;
 } GOMP_PLUGIN_1.1;
+
+GOMP_PLUGIN_1.3 {
+  global:
+	GOMP_PLUGIN_goacc_profiling_dispatch;
+	GOMP_PLUGIN_goacc_thread;
+} GOMP_PLUGIN_1.2;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index e2e384ae8b62..6db895f62726 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -111,6 +111,7 @@  changed to GNU Offloading and Multi Processing Runtime Library.
                                asynchronous operations.
 * OpenACC Library Interoperability:: OpenACC library interoperability with the
                                NVIDIA CUBLAS library.
+* OpenACC Profiling Interface::
 * The libgomp ABI::            Notes on the external ABI presented by libgomp.
 * Reporting Bugs::             How to report bugs in the GNU Offloading and
                                Multi Processing Runtime Library.
@@ -1897,6 +1898,13 @@  API routines for target platforms.
 * acc_get_current_cuda_context::Get CUDA context handle.
 * acc_get_cuda_stream::         Get CUDA stream handle.
 * acc_set_cuda_stream::         Set CUDA stream handle.
+
+API routines for the OpenACC Profiling Interface.
+
+* acc_prof_register::           Register callbacks.
+* acc_prof_unregister::         Unregister callbacks.
+* acc_prof_lookup::             Obtain inquiry functions.
+* acc_register_library::        Library registration.
 @end menu
 
 
@@ -2823,6 +2831,90 @@  A.2.1.4.
 
 
 
+@node acc_prof_register
+@section @code{acc_prof_register} -- Register callbacks.
+@table @asis
+@item @emph{Description}:
+This function registers callbacks.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
+@node acc_prof_unregister
+@section @code{acc_prof_unregister} -- Unregister callbacks.
+@table @asis
+@item @emph{Description}:
+This function unregisters callbacks.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_prof_unregister (acc_event_t, acc_prof_callback, acc_register_t);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
+@node acc_prof_lookup
+@section @code{acc_prof_lookup} -- Obtain inquiry functions.
+@table @asis
+@item @emph{Description}:
+Function to obtain inquiry functions.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{acc_query_fn acc_prof_lookup (const char *);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
+@node acc_register_library
+@section @code{acc_register_library} -- Library registration.
+@table @asis
+@item @emph{Description}:
+Function for library registration.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_register_library (acc_prof_reg, acc_prof_reg, acc_prof_lookup_func);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}, @ref{ACC_PROFLIB}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
 @c ---------------------------------------------------------------------
 @c OpenACC Environment Variables
 @c ---------------------------------------------------------------------
@@ -2832,11 +2924,14 @@  A.2.1.4.
 
 The variables @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM}
 are defined by section 4 of the OpenACC specification in version 2.0.
+The variable @env{ACC_PROFLIB}
+is defined by section 4 of the OpenACC specification in version 2.6.
 The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
 
 @menu
 * ACC_DEVICE_TYPE::
 * ACC_DEVICE_NUM::
+* ACC_PROFLIB::
 * GCC_ACC_NOTIFY::
 @end menu
 
@@ -2862,6 +2957,19 @@  The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
 
 
 
+@node ACC_PROFLIB
+@section @code{ACC_PROFLIB}
+@table @asis
+@item @emph{See also}:
+@ref{acc_register_library}, @ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+4.3.
+@end table
+
+
+
 @node GCC_ACC_NOTIFY
 @section @code{GCC_ACC_NOTIFY}
 @table @asis
@@ -3077,6 +3185,310 @@  Application Programming Interface”, Version 2.0.}
 
 
 
+@c ---------------------------------------------------------------------
+@c OpenACC Profiling Interface
+@c ---------------------------------------------------------------------
+
+@node OpenACC Profiling Interface
+@chapter OpenACC Profiling Interface
+
+@section Implementation Status and Implementation-Defined Behavior
+
+We're implementing the OpenACC Profiling Interface as defined by the
+OpenACC 2.6 specification.  We're clarifying some aspects here as
+@emph{implementation-defined behavior}, while they're still under
+discussion within the OpenACC Technical Committee.
+
+This implementation is tuned to keep the performance impact as low as
+possible for the (very common) case that the Profiling Interface is
+not enabled.  This is relevant, as the Profiling Interface affects all
+the @emph{hot} code paths (in the target code, not in the offloaded
+code).  Users of the OpenACC Profiling Interface can be expected to
+understand that performance will be impacted to some degree once the
+Profiling Interface has gotten enabled: for example, because of the
+@emph{runtime} (libgomp) calling into a third-party @emph{library} for
+every event that has been registered.
+
+We're not yet accounting for the fact that @cite{OpenACC events may
+occur during event processing}.
+
+We're not yet implementing initialization via a
+@code{acc_register_library} function that is either statically linked
+in, or dynamically via @env{LD_PRELOAD}.
+Initialization via @code{acc_register_library} functions dynamically
+loaded via the @env{ACC_PROFLIB} environment variable does work, as
+does directly calling @code{acc_prof_register},
+@code{acc_prof_unregister}, @code{acc_prof_lookup}.
+
+As currently there are no inquiry functions defined, calls to
+@code{acc_prof_lookup} will always return @code{NULL}.
+
+There aren't separate @emph{start}, @emph{stop} events defined for the
+event types @code{acc_ev_create}, @code{acc_ev_delete},
+@code{acc_ev_alloc}, @code{acc_ev_free}.  It's not clear if these
+should be triggered before or after the actual device-specific call is
+made.  We trigger them after.
+
+Remarks about data provided to callbacks:
+
+@table @asis
+
+@item @code{acc_prof_info.event_type}
+It's not clear if for @emph{nested} event callbacks (for example,
+@code{acc_ev_enqueue_launch_start} as part of a parent compute
+construct), this should be set for the nested event
+(@code{acc_ev_enqueue_launch_start}), or if the value of the parent
+construct should remain (@code{acc_ev_compute_construct_start}).  In
+this implementation, the value will generally correspond to the
+innermost nested event type.
+
+@item @code{acc_prof_info.device_type}
+@itemize
+
+@item
+For @code{acc_ev_compute_construct_start}, and in presence of an
+@code{if} clause with @emph{false} argument, this will still refer to
+the offloading device type.
+It's not clear if that's the expected behavior.
+
+@item
+Complementary to the item before, for
+@code{acc_ev_compute_construct_end}, this is set to
+@code{acc_device_host} in presence of an @code{if} clause with
+@emph{false} argument.
+It's not clear if that's the expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.thread_id}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.async}
+@itemize
+
+@item
+Not yet implemented correctly for
+@code{acc_ev_compute_construct_start}.
+
+@item
+In a compute construct, for host-fallback
+execution/@code{acc_device_host} it will always be
+@code{acc_async_sync}.
+It's not clear if that's the expected behavior.
+
+@item
+For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end},
+it will always be @code{acc_async_sync}.
+It's not clear if that's the expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.async_queue}
+There is no @cite{limited number of asynchronous queues} in libgomp.
+This will always have the same value as @code{acc_prof_info.async}.
+
+@item @code{acc_prof_info.src_file}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_prof_info.func_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_prof_info.line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type}
+Relating to @code{acc_prof_info.event_type} discussed above, in this
+implementation, this will always be the same value as
+@code{acc_prof_info.event_type}.
+
+@item @code{acc_event_info.*.parent_construct}
+@itemize
+
+@item
+Will be @code{acc_construct_parallel} for all OpenACC compute
+constructs as well as many OpenACC Runtime API calls; should be the
+one matching the actual construct, or
+@code{acc_construct_runtime_api}, respectively.
+
+@item
+Will be @code{acc_construct_enter_data} or
+@code{acc_construct_exit_data} when processing variable mappings
+specified in OpenACC @emph{declare} directives; should be
+@code{acc_construct_declare}.
+
+@item
+For implicit @code{acc_ev_device_init_start},
+@code{acc_ev_device_init_end}, and explicit as well as implicit
+@code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start}, and
+@code{acc_ev_enqueue_download_end}, will be
+@code{acc_construct_parallel}; should reflect the real parent
+construct.
+
+@end itemize
+
+@item @code{acc_event_info.*.implicit}
+For @code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start}, and
+@code{acc_ev_enqueue_download_end}, this currently will be @code{1}
+also for explicit usage.
+
+@item @code{acc_event_info.data_event.var_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_event_info.data_event.host_ptr}
+For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always
+@code{NULL}.
+
+@item @code{typedef union acc_api_info}
+@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific
+Information}.  This should obviously be @code{typedef @emph{struct}
+acc_api_info}.
+
+@item @code{acc_api_info.device_api}
+Possibly not yet implemented correctly for
+@code{acc_ev_compute_construct_start},
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}:
+will always be @code{acc_device_api_none} for these event types.
+For @code{acc_ev_enter_data_start}, it will be
+@code{acc_device_api_none} in some cases.
+
+@item @code{acc_api_info.device_type}
+Always the same as @code{acc_prof_info.device_type}.
+
+@item @code{acc_api_info.vendor}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_api_info.device_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.context_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.async_handle}
+Always @code{NULL}; not yet implemented.
+
+@end table
+
+Remarks about certain event types:
+
+@table @asis
+
+@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+@itemize
+
+@item
+@c See 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' in
+@c 'libgomp.oacc-c-c++-common/acc_prof-kernels-1.c',
+@c 'libgomp.oacc-c-c++-common/acc_prof-parallel-1.c'.
+Whan a compute construct triggers implicit
+@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}
+events, they currently aren't @emph{nested within} the corresponding
+@code{acc_ev_compute_construct_start} and
+@code{acc_ev_compute_construct_end}, but they're currently observed
+@emph{before} @code{acc_ev_compute_construct_start}.
+It's not clear what to do: the standard asks us provide a lot of
+details to the @code{acc_ev_compute_construct_start} callback, without
+(implicitly) initializing a device before?
+
+@item
+Callbacks for these event types will not be invoked for calls to the
+@code{acc_set_device_type} and @code{acc_set_device_num} functions.
+It's not clear if they should be.
+
+@end itemize
+
+@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}, @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
+@itemize
+
+@item
+Callbacks for these event types will also be invoked for OpenACC
+@emph{host_data} constructs.
+It's not clear if they should be.
+
+@item
+Callbacks for these event types will also be invoked when processing
+variable mappings specified in OpenACC @emph{declare} directives.
+It's not clear if they should be.
+
+@end itemize
+
+@end table
+
+Callbacks for the following event types will be invoked, but dispatch
+and information provided therein has not yet been thoroughly reviewed:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
+@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
+@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
+@end itemize
+
+During device initialization, and finalization, respectively,
+callbacks for the following event types will not yet be invoked:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@end itemize
+
+Callbacks for the following event types have not yet been implemented,
+so currently won't be invoked:
+
+@itemize
+@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
+@item @code{acc_ev_runtime_shutdown}
+@item @code{acc_ev_create}, @code{acc_ev_delete}
+@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
+@end itemize
+
+For the following runtime library functions, not all expected
+callbacks will be invoked (mostly concerning implicit device
+initialization):
+
+@itemize
+@item @code{acc_get_num_devices}
+@item @code{acc_set_device_type}
+@item @code{acc_get_device_type}
+@item @code{acc_set_device_num}
+@item @code{acc_get_device_num}
+@item @code{acc_init}
+@item @code{acc_shutdown}
+@end itemize
+
+Aside from implicit device initialization, for the following runtime
+library functions, no callbacks will be invoked for shared-memory
+offloading devices (it's not clear if they should be):
+
+@itemize
+@item @code{acc_malloc}
+@item @code{acc_free}
+@item @code{acc_copyin}, @code{acc_present_or_copyin}, @code{acc_copyin_async}
+@item @code{acc_create}, @code{acc_present_or_create}, @code{acc_create_async}
+@item @code{acc_copyout}, @code{acc_copyout_async}, @code{acc_copyout_finalize}, @code{acc_copyout_finalize_async}
+@item @code{acc_delete}, @code{acc_delete_async}, @code{acc_delete_finalize}, @code{acc_delete_finalize_async}
+@item @code{acc_update_device}, @code{acc_update_device_async}
+@item @code{acc_update_self}, @code{acc_update_self_async}
+@item @code{acc_map_data}, @code{acc_unmap_data}
+@item @code{acc_memcpy_to_device}, @code{acc_memcpy_to_device_async}
+@item @code{acc_memcpy_from_device}, @code{acc_memcpy_from_device_async}
+@end itemize
+
+
+
 @c ---------------------------------------------------------------------
 @c The libgomp ABI
 @c ---------------------------------------------------------------------
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 51bb676610c8..1760e8c90c63 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -43,17 +43,6 @@  get_goacc_thread (void)
   return thr;
 }
 
-static struct gomp_device_descr *
-get_goacc_thread_device (void)
-{
-  struct goacc_thread *thr = goacc_thread ();
-
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
-
-  return thr->dev;
-}
-
 static int
 validate_async_val (int async)
 {
@@ -76,7 +65,10 @@  validate_async_val (int async)
 
 /* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
    might return NULL if no asyncqueue is to be used.  Otherwise, if CREATE,
-   create the asyncqueue if it doesn't exist yet.  */
+   create the asyncqueue if it doesn't exist yet.
+
+   Unless CREATE, this will not generate any OpenACC Profiling Interface
+   events.  */
 
 attribute_hidden struct goacc_asyncqueue *
 lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
@@ -152,8 +144,25 @@  acc_async_test (int async)
   goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
   if (!aq)
     return 1;
-  else
-    return thr->dev->openacc.async.test_func (aq);
+
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
+  int res = thr->dev->openacc.async.test_func (aq);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return res;
 }
 
 int
@@ -161,6 +170,10 @@  acc_async_test_all (void)
 {
   struct goacc_thread *thr = get_goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
   int ret = 1;
   gomp_mutex_lock (&thr->dev->openacc.async.lock);
   for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
@@ -170,6 +183,13 @@  acc_async_test_all (void)
 	break;
       }
   gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   return ret;
 }
 
@@ -179,8 +199,26 @@  acc_wait (int async)
   struct goacc_thread *thr = get_goacc_thread ();
 
   goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
-  if (aq && !thr->dev->openacc.async.synchronize_func (aq))
+  if (!aq)
+    return;
+
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
+  if (!thr->dev->openacc.async.synchronize_func (aq))
     gomp_fatal ("wait on %d failed", async);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.  */
@@ -205,10 +243,19 @@  acc_wait_async (int async1, int async2)
   if (!aq1)
     return;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async2;
+      prof_info.async_queue = prof_info.async;
+    }
+
   goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
   /* An async queue is always synchronized with itself.  */
   if (aq1 == aq2)
-    return;
+    goto out_prof;
 
   if (aq2)
     {
@@ -222,18 +269,35 @@  acc_wait_async (int async1, int async2)
       if (!thr->dev->openacc.async.synchronize_func (aq1))
 	gomp_fatal ("wait on %d failed", async1);
     }
+
+ out_prof:
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 acc_wait_all (void)
 {
-  struct gomp_device_descr *dev = get_goacc_thread_device ();
+  struct goacc_thread *thr = goacc_thread ();
+
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
 
   bool ret = true;
-  gomp_mutex_lock (&dev->openacc.async.lock);
-  for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
-    ret &= dev->openacc.async.synchronize_func (l->aq);
-  gomp_mutex_unlock (&dev->openacc.async.lock);
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    ret &= thr->dev->openacc.async.synchronize_func (l->aq);
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 
   if (!ret)
     gomp_fatal ("wait all failed");
@@ -255,6 +319,15 @@  acc_wait_all_async (int async)
 {
   struct goacc_thread *thr = get_goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
 
   bool ret = true;
@@ -270,6 +343,12 @@  acc_wait_all_async (int async)
     }
   gomp_mutex_unlock (&thr->dev->openacc.async.lock);
 
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   if (!ret)
     gomp_fatal ("wait all async(%d) failed", async);
 }
diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c
index 1a6946c5ddab..07951df5bb0f 100644
--- a/libgomp/oacc-cuda.c
+++ b/libgomp/oacc-cuda.c
@@ -37,10 +37,23 @@  acc_get_current_cuda_device (void)
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
-    return thr->dev->openacc.cuda.get_current_device_func ();
+    {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
+      ret = thr->dev->openacc.cuda.get_current_device_func ();
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
+    }
 
-  return NULL;
+  return ret;
 }
 
 void *
@@ -48,10 +61,23 @@  acc_get_current_cuda_context (void)
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
-    return thr->dev->openacc.cuda.get_current_context_func ();
- 
-  return NULL;
+    {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
+      ret = thr->dev->openacc.cuda.get_current_context_func ();
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
+    }
+
+  return ret;
 }
 
 void *
@@ -62,14 +88,32 @@  acc_get_cuda_stream (int async)
   if (!async_valid_p (async))
     return NULL;
 
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
     {
       goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
-      if (aq)
-	return thr->dev->openacc.cuda.get_stream_func (aq);
+      if (!aq)
+	return ret;
+
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+      if (profiling_p)
+	{
+	  prof_info.async = async;
+	  prof_info.async_queue = prof_info.async;
+	}
+
+      ret = thr->dev->openacc.cuda.get_stream_func (aq);
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
     }
- 
-  return NULL;
+
+  return ret;
 }
 
 int
@@ -87,6 +131,15 @@  acc_set_cuda_stream (int async, void *stream)
   int ret = -1;
   if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
     {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+      if (profiling_p)
+	{
+	  prof_info.async = async;
+	  prof_info.async_queue = prof_info.async;
+	}
+
       goacc_aq aq = get_goacc_asyncqueue (async);
       /* Due to not using an asyncqueue for "acc_async_sync", this cannot be
 	 used to change the CUDA stream associated with "acc_async_sync".  */
@@ -95,11 +148,19 @@  acc_set_cuda_stream (int async, void *stream)
 	  assert (async == acc_async_sync);
 	  gomp_debug (0, "Refusing request to set CUDA stream associated"
 		      " with \"acc_async_sync\"\n");
-	  return 0;
+	  ret = 0;
+	  goto out_prof;
 	}
       gomp_mutex_lock (&thr->dev->openacc.async.lock);
       ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
       gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+    out_prof:
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
     }
 
   return ret;
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 28471e40ba00..e1568c535b32 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -210,8 +210,67 @@  acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
    held before calling this function.  */
 
 static struct gomp_device_descr *
-acc_init_1 (acc_device_t d)
+acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
 {
+  bool check_not_nested_p;
+  if (implicit)
+    {
+      /* In the implicit case, there should (TODO: must?) already be something
+	 have been set up for an outer construct.  */
+      check_not_nested_p = false;
+    }
+  else
+    {
+      check_not_nested_p = true;
+      /* TODO: should we set 'thr->prof_info' etc. in this case ('acc_init')?
+	 The problem is, that we don't have 'thr' yet?  (So,
+	 'check_not_nested_p = true' also is pointless actually.)  */
+    }
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (check_not_nested_p);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_device_init_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = d;
+      prof_info.device_number = goacc_device_num;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info device_init_event_info;
+  if (profiling_p)
+    {
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      device_init_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      device_init_event_info.other_event.parent_construct = parent_construct;
+      device_init_event_info.other_event.implicit = implicit;
+      device_init_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
+
   struct gomp_device_descr *base_dev, *acc_dev;
   int ndevs;
 
@@ -234,6 +293,14 @@  acc_init_1 (acc_device_t d)
   gomp_init_device (acc_dev);
   gomp_mutex_unlock (&acc_dev->lock);
 
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_device_init_end;
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &device_init_event_info,
+				&api_info);
+    }
+
   return base_dev;
 }
 
@@ -423,7 +490,11 @@  goacc_attach_host_thread_to_device (int ord)
   thr->dev = acc_dev = &base_dev[ord];
   thr->saved_bound_dev = NULL;
   thr->mapped_data = NULL;
-  
+  thr->prof_info = NULL;
+  thr->api_info = NULL;
+  /* Initially, all callbacks for all events are enabled.  */
+  thr->prof_callbacks_enabled = true;
+
   thr->target_tls
     = acc_dev->openacc.create_thread_data_func (ord);
 }
@@ -437,9 +508,7 @@  acc_init (acc_device_t d)
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
-
-  cached_base_dev = acc_init_1 (d);
-
+  cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
   gomp_mutex_unlock (&acc_device_lock);
   
   goacc_attach_host_thread_to_device (-1);
@@ -498,6 +567,12 @@  acc_set_device_type (acc_device_t d)
   struct gomp_device_descr *base_dev, *acc_dev;
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    prof_info.device_type = d;
+
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
@@ -522,6 +597,12 @@  acc_set_device_type (acc_device_t d)
     }
 
   goacc_attach_host_thread_to_device (-1);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 ialias (acc_set_device_type)
@@ -537,12 +618,22 @@  acc_get_device_type (void)
     res = acc_device_type (thr->base_dev->type);
   else
     {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
       gomp_init_targets_once ();
 
       gomp_mutex_lock (&acc_device_lock);
       dev = resolve_device (acc_device_default, true);
       gomp_mutex_unlock (&acc_device_lock);
       res = acc_device_type (dev->type);
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
     }
 
   assert (res != acc_device_default
@@ -562,12 +653,24 @@  acc_get_device_num (acc_device_t d)
   if (d >= _ACC_device_hwm)
     gomp_fatal ("unknown device type %u", (unsigned) d);
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    prof_info.device_type = d;
+
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
   dev = resolve_device (d, true);
   gomp_mutex_unlock (&acc_device_lock);
 
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   if (thr && thr->base_dev == dev && thr->dev)
     return thr->dev->target_id;
 
@@ -689,8 +792,13 @@  goacc_lazy_initialize (void)
   if (thr && thr->dev)
     return;
 
+  gomp_init_targets_once ();
+
+  gomp_mutex_lock (&acc_device_lock);
   if (!cached_base_dev)
-    acc_init (acc_device_default);
-  else
-    goacc_attach_host_thread_to_device (-1);
+    cached_base_dev = acc_init_1 (acc_device_default,
+				  acc_construct_parallel, 1);
+  gomp_mutex_unlock (&acc_device_lock);
+
+  goacc_attach_host_thread_to_device (-1);
 }
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index e4b6ea6b7db4..5ca9944601e2 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -40,6 +40,7 @@ 
 
 #include "openacc.h"
 #include "config.h"
+#include "acc_prof.h"
 #include <stddef.h>
 #include <stdbool.h>
 #include <stdarg.h>
@@ -68,6 +69,12 @@  struct goacc_thread
      strictly push/pop semantics according to lexical scope.  */
   struct target_mem_desc *mapped_data;
 
+  /* Data of the OpenACC Profiling Interface.  */
+  acc_prof_info *prof_info;
+  acc_api_info *api_info;
+  /* Per-thread toggle of OpenACC Profiling Interface callbacks.  */
+  bool prof_callbacks_enabled;
+
   /* These structures form a list: this is the next thread in that list.  */
   struct goacc_thread *next;
 
@@ -128,6 +135,28 @@  async_synchronous_p (int async)
   return async == acc_async_sync;
 }
 
+
+extern bool goacc_prof_enabled;
+/* Tune for the (very common) case that profiling is not enabled.  */
+#define GOACC_PROF_ENABLED \
+  (__builtin_expect (__atomic_load_n (&goacc_prof_enabled, \
+				      MEMMODEL_ACQUIRE) == true, false))
+
+void goacc_profiling_initialize (void);
+bool _goacc_profiling_dispatch_p (bool);
+/* Tune for the (very common) case that profiling is not enabled.  */
+#define GOACC_PROFILING_DISPATCH_P(...) \
+  (GOACC_PROF_ENABLED \
+   && _goacc_profiling_dispatch_p (__VA_ARGS__))
+bool _goacc_profiling_setup_p (struct goacc_thread *,
+			       acc_prof_info *, acc_api_info *);
+/* Tune for the (very common) case that profiling is not enabled.  */
+#define GOACC_PROFILING_SETUP_P(...) \
+  (GOACC_PROFILING_DISPATCH_P (false) \
+   && _goacc_profiling_setup_p (__VA_ARGS__))
+void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
+			       acc_api_info *);
+
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility pop
 #endif
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 03df0d4fbf63..0f3832f52b3e 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -108,7 +108,19 @@  acc_malloc (size_t s)
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return malloc (s);
 
-  return thr->dev->alloc_func (thr->dev->target_id, s);
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
+  void *res = thr->dev->alloc_func (thr->dev->target_id, s);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return res;
 }
 
 /* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -131,6 +143,10 @@  acc_free (void *d)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return free (d);
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
   gomp_mutex_lock (&acc_dev->lock);
 
   /* We don't have to call lazy open here, as the ptr value must have
@@ -151,6 +167,12 @@  acc_free (void *d)
 
   if (!acc_dev->free_func (acc_dev->target_id, d))
     gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 static void
@@ -172,11 +194,26 @@  memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
       return;
     }
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   goacc_aq aq = get_goacc_asyncqueue (async);
   if (from)
     gomp_copy_dev2host (thr->dev, aq, h, d, s);
   else
     gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -221,6 +258,9 @@  acc_deviceptr (void *h)
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&dev->lock);
 
   n = lookup_host (dev, h, 1);
@@ -258,6 +298,9 @@  acc_hostptr (void *d)
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return d;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
@@ -295,6 +338,9 @@  acc_is_present (void *h, size_t s)
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h != NULL;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -339,6 +385,10 @@  acc_map_data (void *h, void *d, size_t s)
 	gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
                     (void *)h, (int)s, (void *)d, (int)s);
 
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
       gomp_mutex_lock (&acc_dev->lock);
 
       if (lookup_host (acc_dev, h, s))
@@ -360,6 +410,12 @@  acc_map_data (void *h, void *d, size_t s)
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
 			   &kinds, true, GOMP_MAP_VARS_OPENACC);
       tgt->list[0].key->refcount = REFCOUNT_INFINITY;
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
     }
 
   gomp_mutex_lock (&acc_dev->lock);
@@ -380,6 +436,10 @@  acc_unmap_data (void *h)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
   size_t host_size;
 
   gomp_mutex_lock (&acc_dev->lock);
@@ -433,6 +493,12 @@  acc_unmap_data (void *h)
   gomp_mutex_unlock (&acc_dev->lock);
 
   gomp_unmap_vars (t, true);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 #define FLAG_PRESENT (1 << 0)
@@ -456,6 +522,15 @@  present_create_copy (unsigned f, void *h, size_t s, int async)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -518,6 +593,12 @@  present_create_copy (unsigned f, void *h, size_t s, int async)
       gomp_mutex_unlock (&acc_dev->lock);
     }
 
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   return d;
 }
 
@@ -599,6 +680,15 @@  delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -672,6 +762,12 @@  delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
     }
 
   gomp_mutex_unlock (&acc_dev->lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -737,6 +833,15 @@  update_dev_host (int is_dev, void *h, size_t s, int async)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -758,6 +863,12 @@  update_dev_host (int is_dev, void *h, size_t s, int async)
     gomp_copy_dev2host (acc_dev, aq, h, d, s);
 
   gomp_mutex_unlock (&acc_dev->lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index fa99a2ad1a90..e56330f6226b 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -152,21 +152,75 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
   thr = goacc_thread ();
   acc_dev = thr->dev;
 
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_compute_construct_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info compute_construct_event_info;
+  if (profiling_p)
+    {
+      compute_construct_event_info.other_event.event_type
+	= prof_info.event_type;
+      compute_construct_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      compute_construct_event_info.other_event.parent_construct
+	= acc_construct_parallel;
+      compute_construct_event_info.other_event.implicit = 0;
+      compute_construct_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+			      &api_info);
+
   handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
 
   /* Host fallback if "if" clause is false or if the current device is set to
      the host.  */
   if (flags & GOACC_FLAG_HOST_FALLBACK)
     {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
       goacc_save_and_set_bind (acc_device_host);
       fn (hostaddrs);
       goacc_restore_bind ();
-      return;
+      goto out_prof;
     }
   else if (acc_device_type (acc_dev->type) == acc_device_host)
     {
       fn (hostaddrs);
-      return;
+      goto out_prof;
     }
 
   /* Default: let the runtime choose.  */
@@ -200,6 +254,13 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
 
 	    if (async == GOMP_LAUNCH_OP_MAX)
 	      async = va_arg (ap, unsigned);
+
+	    if (profiling_p)
+	      {
+		prof_info.async = async;
+		prof_info.async_queue = prof_info.async;
+	      }
+
 	    break;
 	  }
 
@@ -233,10 +294,34 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
   else
     tgt_fn = (void (*)) fn;
 
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_start;
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+	= compute_construct_event_info.other_event.parent_construct;
+      enter_exit_data_event_info.other_event.implicit = 1;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
+    }
+
   goacc_aq aq = get_goacc_asyncqueue (async);
 
   tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
 			     true, GOMP_MAP_VARS_OPENACC);
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_end;
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
+    }
   
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
@@ -244,17 +329,46 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
 			    + tgt->list[i].key->tgt_offset
 			    + tgt->list[i].offset);
   if (aq == NULL)
+    acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, dims,
+				tgt);
+  else
+    acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+				      dims, tgt, aq);
+
+  if (profiling_p)
     {
-      acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
-				  dims, tgt);
-      /* If running synchronously, unmap immediately.  */
-      gomp_unmap_vars (tgt, true);
+      prof_info.event_type = acc_ev_exit_data_start;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
     }
+
+  /* If running synchronously, unmap immediately.  */
+  if (aq == NULL)
+    gomp_unmap_vars (tgt, true);
   else
+    gomp_unmap_vars_async (tgt, true, aq);
+
+  if (profiling_p)
     {
-      acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
-					dims, tgt, aq);
-      gomp_unmap_vars_async (tgt, true, aq);
+      prof_info.event_type = acc_ev_exit_data_end;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
+    }
+
+ out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_compute_construct_end;
+      compute_construct_event_info.other_event.event_type
+	= prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				&api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
     }
 }
 
@@ -293,16 +407,83 @@  GOACC_data_start (int flags_m, size_t mapnum,
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_enter_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync; /* Always synchronous.  */
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info enter_data_event_info;
+  if (profiling_p)
+    {
+      enter_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_data_event_info.other_event.parent_construct = acc_construct_data;
+      for (int i = 0; i < mapnum; ++i)
+	if ((kinds[i] & 0xff) == GOMP_MAP_USE_DEVICE_PTR)
+	  {
+	    /* If there is one such data mapping kind, then this is actually an
+	       OpenACC 'host_data' construct.  (GCC maps the OpenACC
+	       'host_data' construct to the OpenACC 'data' construct.)  Apart
+	       from artificial test cases (such as an OpenACC 'host_data'
+	       construct's (implicit) device initialization when there hasn't
+	       been any device data be set up before...), there can't really
+	       any meaningful events be generated from OpenACC 'host_data'
+	       constructs, though.  */
+	    enter_data_event_info.other_event.parent_construct
+	      = acc_construct_host_data;
+	    break;
+	  }
+      enter_data_event_info.other_event.implicit = 0;
+      enter_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
   /* Host fallback or 'do nothing'.  */
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
     {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
       tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
 			   GOMP_MAP_VARS_OPENACC);
       tgt->prev = thr->mapped_data;
       thr->mapped_data = tgt;
 
-      return;
+      goto out_prof;
     }
 
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
@@ -311,18 +492,90 @@  GOACC_data_start (int flags_m, size_t mapnum,
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
   tgt->prev = thr->mapped_data;
   thr->mapped_data = tgt;
+
+ out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_end;
+      enter_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 GOACC_data_end (void)
 {
   struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
   struct target_mem_desc *tgt = thr->mapped_data;
 
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_exit_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync; /* Always synchronous.  */
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info exit_data_event_info;
+  if (profiling_p)
+    {
+      exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      exit_data_event_info.other_event.parent_construct = acc_construct_data;
+      exit_data_event_info.other_event.implicit = 0;
+      exit_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
   gomp_debug (0, "  %s: restore mappings\n", __FUNCTION__);
   thr->mapped_data = tgt->prev;
   gomp_unmap_vars (tgt, true);
   gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
+
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_exit_data_end;
+      exit_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -342,19 +595,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
   thr = goacc_thread ();
   acc_dev = thr->dev;
 
-  if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-      || (flags & GOACC_FLAG_HOST_FALLBACK))
-    return;
-
-  if (num_waits)
-    {
-      va_list ap;
-
-      va_start (ap, num_waits);
-      goacc_wait (async, num_waits, &ap);
-      va_end (ap);
-    }
-
   /* Determine whether "finalize" semantics apply to all mappings of this
      OpenACC directive.  */
   bool finalize = false;
@@ -394,6 +634,77 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		      kind);
     }
 
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type
+	= data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_p)
+    {
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+	= data_enter ? acc_construct_enter_data : acc_construct_exit_data;
+      enter_exit_data_event_info.other_event.implicit = 0;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+			      &api_info);
+
+  if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+      || (flags & GOACC_FLAG_HOST_FALLBACK))
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+
+      goto out_prof;
+    }
+
+  if (num_waits)
+    {
+      va_list ap;
+
+      va_start (ap, num_waits);
+      goacc_wait (async, num_waits, &ap);
+      va_end (ap);
+    }
+
   /* In c, non-pointers and arrays are represented by a single data clause.
      Dynamically allocated arrays and subarrays are represented by a data
      clause followed by an internal GOMP_MAP_POINTER.
@@ -486,6 +797,19 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	    i += pointer - 1;
 	  }
       }
+
+ out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type
+	= data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 static void
@@ -534,9 +858,64 @@  GOACC_update (int flags_m, size_t mapnum,
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_update_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info update_event_info;
+  if (profiling_p)
+    {
+      update_event_info.other_event.event_type
+	= prof_info.event_type;
+      update_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      update_event_info.other_event.parent_construct = acc_construct_update;
+      update_event_info.other_event.implicit = 0;
+      update_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
-    return;
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+
+      goto out_prof;
+    }
 
   if (num_waits)
     {
@@ -608,11 +987,38 @@  GOACC_update (int flags_m, size_t mapnum,
 	  break;
 	}
     }
+
+ out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_update_end;
+      update_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 GOACC_wait (int async, int num_waits, ...)
 {
+  goacc_lazy_initialize ();
+
+  struct goacc_thread *thr = goacc_thread ();
+
+  /* No nesting.  */
+  assert (thr->prof_info == NULL);
+  assert (thr->api_info == NULL);
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   if (num_waits)
     {
       va_list ap;
@@ -625,6 +1031,12 @@  GOACC_wait (int async, int num_waits, ...)
     acc_wait_all ();
   else
     acc_wait_all_async (async);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 /* Legacy entry point (GCC 5).  */
diff --git a/libgomp/oacc-plugin.c b/libgomp/oacc-plugin.c
index 5480c1db56ee..b4f71a11c18b 100644
--- a/libgomp/oacc-plugin.c
+++ b/libgomp/oacc-plugin.c
@@ -29,6 +29,7 @@ 
 #include "libgomp.h"
 #include "oacc-plugin.h"
 #include "oacc-int.h"
+#include "acc_prof.h"
 
 /* This plugin function is now obsolete.  */
 void
@@ -38,6 +39,14 @@  GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
   gomp_fatal ("invalid plugin function");
 }
 
+/* Return the TLS data for the current thread.  */
+
+struct goacc_thread *
+GOMP_PLUGIN_goacc_thread (void)
+{
+  return goacc_thread ();
+}
+
 /* Return the target-specific part of the TLS data for the current thread.  */
 
 void *
@@ -57,3 +66,11 @@  GOMP_PLUGIN_acc_default_dim (unsigned int i)
     }
   return goacc_default_dims[i];
 }
+
+void
+GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
+				      acc_event_info *event_info,
+				      acc_api_info *api_info)
+{
+  goacc_profiling_dispatch (prof_info, event_info, api_info);
+}
diff --git a/libgomp/oacc-plugin.h b/libgomp/oacc-plugin.h
index 112d4a593f97..887c6f601253 100644
--- a/libgomp/oacc-plugin.h
+++ b/libgomp/oacc-plugin.h
@@ -27,8 +27,15 @@ 
 #ifndef OACC_PLUGIN_H
 #define OACC_PLUGIN_H 1
 
+#include "oacc-int.h"
+#include "acc_prof.h"
+
 extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
+extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
 extern void *GOMP_PLUGIN_acc_thread (void);
 extern int GOMP_PLUGIN_acc_default_dim (unsigned int);
+extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
+						  acc_event_info *,
+						  acc_api_info *);
 
 #endif
diff --git a/libgomp/oacc-profiling.c b/libgomp/oacc-profiling.c
new file mode 100644
index 000000000000..eff288650895
--- /dev/null
+++ b/libgomp/oacc-profiling.c
@@ -0,0 +1,662 @@ 
+/* OpenACC Profiling Interface
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   Contributed by Mentor, a Siemens Business.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define _GNU_SOURCE
+#include "libgomp.h"
+#include "oacc-int.h"
+#include "secure_getenv.h"
+#include "acc_prof.h"
+#include <assert.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#endif
+#ifdef PLUGIN_SUPPORT
+# include <dlfcn.h>
+#endif
+
+#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")")
+
+/* Statically assert that the layout of the common fields in the
+   'acc_event_info' variants matches.  */
+/* 'event_type' */
+STATIC_ASSERT (offsetof (acc_event_info, event_type)
+	       == offsetof (acc_event_info, data_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+	       == offsetof (acc_event_info, launch_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+	       == offsetof (acc_event_info, other_event.event_type));
+/* 'valid_bytes' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+	       == offsetof (acc_event_info, launch_event.valid_bytes));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+	       == offsetof (acc_event_info, other_event.valid_bytes));
+/* 'parent_construct' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+	       == offsetof (acc_event_info, launch_event.parent_construct));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+	       == offsetof (acc_event_info, other_event.parent_construct));
+/* 'implicit' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+	       == offsetof (acc_event_info, launch_event.implicit));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+	       == offsetof (acc_event_info, other_event.implicit));
+/* 'tool_info' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+	       == offsetof (acc_event_info, launch_event.tool_info));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+	       == offsetof (acc_event_info, other_event.tool_info));
+
+struct goacc_prof_callback_entry
+{
+  acc_prof_callback cb;
+  int ref;
+  bool enabled;
+  struct goacc_prof_callback_entry *next;
+};
+
+/* Use a separate flag to minimize run-time performance impact for the (very
+   common) case that profiling is not enabled.
+
+   Once enabled, we're not going to disable this anymore, anywhere.  We
+   probably could, by adding appropriate logic to 'acc_prof_register',
+   'acc_prof_unregister'.  */
+bool goacc_prof_enabled = false;
+
+/* Global state for registered callbacks.
+   'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle.  */
+static bool goacc_prof_callbacks_enabled[acc_ev_last];
+static struct goacc_prof_callback_entry *goacc_prof_callback_entries[acc_ev_last];
+/* Lock used to protect access to 'goacc_prof_callbacks_enabled', and
+   'goacc_prof_callback_entries'.  */
+static gomp_mutex_t goacc_prof_lock;
+
+void
+goacc_profiling_initialize (void)
+{
+  gomp_mutex_init (&goacc_prof_lock);
+
+  /* Initially, all callbacks for all events are enabled.  */
+  for (int i = 0; i < acc_ev_last; ++i)
+    goacc_prof_callbacks_enabled[i] = true;
+
+
+#ifdef PLUGIN_SUPPORT
+  char *acc_proflibs = secure_getenv ("ACC_PROFLIB");
+  while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
+    {
+      char *acc_proflibs_sep = strchr (acc_proflibs, ';');
+      char *acc_proflib;
+      if (acc_proflibs_sep == acc_proflibs)
+	{
+	  /* Stray ';' separator: make sure we don't 'dlopen' the main
+	     program.  */
+	  acc_proflib = NULL;
+	}
+      else
+	{
+	  if (acc_proflibs_sep != NULL)
+	    {
+	      /* Single out the first library.  */
+	      acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1);
+	      memcpy (acc_proflib, acc_proflibs,
+		      acc_proflibs_sep - acc_proflibs);
+	      acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0';
+	    }
+	  else
+	    {
+	      /* No ';' separator, so only one library.  */
+	      acc_proflib = acc_proflibs;
+	    }
+
+	  gomp_debug (0, "%s: dlopen (\"%s\")\n", __FUNCTION__, acc_proflib);
+	  void *dl_handle = dlopen (acc_proflib, RTLD_LAZY);
+	  if (dl_handle != NULL)
+	    {
+	      typeof (&acc_register_library) a_r_l
+		= dlsym (dl_handle, "acc_register_library");
+	      if (a_r_l == NULL)
+		goto dl_fail;
+	      gomp_debug (0, "  %s: calling %s:acc_register_library\n",
+			  __FUNCTION__, acc_proflib);
+	      a_r_l (acc_prof_register, acc_prof_unregister,
+		     acc_prof_lookup);
+	    }
+	  else
+	    {
+	    dl_fail:
+	      gomp_error ("while loading ACC_PROFLIB \"%s\": %s",
+			  acc_proflib, dlerror ());
+	      if (dl_handle != NULL)
+		{
+		  int err = dlclose (dl_handle);
+		  dl_handle = NULL;
+		  if (err != 0)
+		    goto dl_fail;
+		}
+	    }
+	}
+
+      if (acc_proflib != acc_proflibs)
+	{
+	  free (acc_proflib);
+
+	  acc_proflibs = acc_proflibs_sep + 1;
+	}
+      else
+	acc_proflibs = NULL;
+    }
+#endif /* PLUGIN_SUPPORT */
+}
+
+void
+acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+	      __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+
+  /* For any events to be dispatched, the user first has to register a
+     callback, which makes this here a good place for enabling the whole
+     machinery.  */
+  if (!GOACC_PROF_ENABLED)
+    __atomic_store_n (&goacc_prof_enabled, true, MEMMODEL_RELEASE);
+
+
+  enum
+  {
+    EVENT_KIND_BOGUS,
+    EVENT_KIND_NORMAL,
+    /* As end events invoke callbacks in the reverse order, we register these
+       in the reverse order here.  */
+    EVENT_KIND_END,
+  } event_kind = EVENT_KIND_BOGUS;
+  switch (ev)
+    {
+    case acc_ev_none:
+    case acc_ev_device_init_start:
+    case acc_ev_device_shutdown_start:
+    case acc_ev_runtime_shutdown:
+    case acc_ev_create:
+    case acc_ev_delete:
+    case acc_ev_alloc:
+    case acc_ev_free:
+    case acc_ev_enter_data_start:
+    case acc_ev_exit_data_start:
+    case acc_ev_update_start:
+    case acc_ev_compute_construct_start:
+    case acc_ev_enqueue_launch_start:
+    case acc_ev_enqueue_upload_start:
+    case acc_ev_enqueue_download_start:
+    case acc_ev_wait_start:
+      event_kind = EVENT_KIND_NORMAL;
+      break;
+    case acc_ev_device_init_end:
+    case acc_ev_device_shutdown_end:
+    case acc_ev_enter_data_end:
+    case acc_ev_exit_data_end:
+    case acc_ev_update_end:
+    case acc_ev_compute_construct_end:
+    case acc_ev_enqueue_launch_end:
+    case acc_ev_enqueue_upload_end:
+    case acc_ev_enqueue_download_end:
+    case acc_ev_wait_end:
+      event_kind = EVENT_KIND_END;
+      break;
+    case acc_ev_last:
+      break;
+    }
+  if (event_kind == EVENT_KIND_BOGUS)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request for bogus 'acc_event_t'\n");
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request with bogus 'acc_register_t'\n");
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+	{
+	  gomp_debug (0, "  globally enabling callbacks\n");
+	  gomp_mutex_lock (&goacc_prof_lock);
+	  /* For 'acc_ev_none', this acts as a global toggle.  */
+	  goacc_prof_callbacks_enabled[ev] = true;
+	  gomp_mutex_unlock (&goacc_prof_lock);
+	  return;
+	}
+      else if (ev == acc_ev_none && cb != NULL)
+	{
+	  gomp_debug (0, "  ignoring request\n");
+	  return;
+	}
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+	{
+	  gomp_debug (0, "  thread: enabling callbacks\n");
+	  goacc_lazy_initialize ();
+	  struct goacc_thread *thr = goacc_thread ();
+	  thr->prof_callbacks_enabled = true;
+	  return;
+	}
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring bogus request\n");
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+	break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      /* If we already have this callback registered, just increment its
+	 reference count.  */
+      if (it != NULL)
+	{
+	  it->ref++;
+	  gomp_debug (0, "  already registered;"
+		      " incrementing reference count to: %d\n", it->ref);
+	}
+      else
+	{
+	  struct goacc_prof_callback_entry *e
+	    = gomp_malloc (sizeof (struct goacc_prof_callback_entry));
+	  e->cb = cb;
+	  e->ref = 1;
+	  e->enabled = true;
+	  bool prepend = (event_kind == EVENT_KIND_END);
+	  /* If we don't have any callback registered yet, also use the
+	     'prepend' code path.  */
+	  if (it_p == NULL)
+	    prepend = true;
+	  if (prepend)
+	    {
+	      gomp_debug (0, "  prepending\n");
+	      e->next = goacc_prof_callback_entries[ev];
+	      goacc_prof_callback_entries[ev] = e;
+	    }
+	  else
+	    {
+	      gomp_debug (0, "  appending\n");
+	      e->next = NULL;
+	      it_p->next = e;
+	    }
+	}
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+	{
+	  gomp_debug (0, "  ignoring request: is not registered\n");
+	  break;
+	}
+      else
+	{
+	  gomp_debug (0, "  enabling\n");
+	  it->enabled = true;
+	}
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+void
+acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+	      __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+  /* If profiling is not enabled, there cannot be anything to unregister.  */
+  if (!GOACC_PROF_ENABLED)
+    return;
+
+  if (ev < acc_ev_none
+      || ev >= acc_ev_last)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request for bogus 'acc_event_t'\n");
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request with bogus 'acc_register_t'\n");
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+	{
+	  gomp_debug (0, "  globally disabling callbacks\n");
+	  gomp_mutex_lock (&goacc_prof_lock);
+	  /* For 'acc_ev_none', this acts as a global toggle.  */
+	  goacc_prof_callbacks_enabled[ev] = false;
+	  gomp_mutex_unlock (&goacc_prof_lock);
+	  return;
+	}
+      else if (ev == acc_ev_none && cb != NULL)
+	{
+	  gomp_debug (0, "  ignoring request\n");
+	  return;
+	}
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+	{
+	  gomp_debug (0, "  thread: disabling callbacks\n");
+	  goacc_lazy_initialize ();
+	  struct goacc_thread *thr = goacc_thread ();
+	  thr->prof_callbacks_enabled = false;
+	  return;
+	}
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring bogus request\n");
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+	break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      if (it == NULL)
+	{
+	  /* Silently ignore.  */
+	  gomp_debug (0, "  ignoring bogus request: is not registered\n");
+	  break;
+	}
+      it->ref--;
+      gomp_debug (0, "  decrementing reference count to: %d\n", it->ref);
+      if (it->ref == 0)
+	{
+	  if (it_p == NULL)
+	    goacc_prof_callback_entries[ev] = it->next;
+	  else
+	    it_p->next = it->next;
+	  free (it);
+	}
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+	{
+	  gomp_debug (0, "  ignoring request: is not registered\n");
+	  break;
+	}
+      else
+	{
+	  gomp_debug (0, "  disabling\n");
+	  it->enabled = false;
+	}
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+acc_query_fn
+acc_prof_lookup (const char *name)
+{
+  gomp_debug (0, "%s (%s)\n",
+	      __FUNCTION__, name ?: "NULL");
+
+  return NULL;
+}
+
+void
+acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
+		      acc_prof_lookup_func lookup)
+{
+  gomp_fatal ("TODO");
+}
+
+/* Prepare to dispatch events?  */
+
+bool
+_goacc_profiling_dispatch_p (bool check_not_nested_p)
+{
+  gomp_debug (0, "%s\n", __FUNCTION__);
+
+  bool ret;
+
+  struct goacc_thread *thr = goacc_thread ();
+  if (__builtin_expect (thr == NULL, false))
+    {
+      /* If we don't have any per-thread state yet, that means that per-thread
+	 callback dispatch has not been explicitly disabled (which only a call
+	 to 'acc_prof_unregister' with 'acc_toggle_per_thread' would do, and
+	 that would have allocated per-thread state via
+	 'goacc_lazy_initialize'); initially, all callbacks for all events are
+	 enabled.  */
+      gomp_debug (0, "  %s: don't have any per-thread state yet\n", __FUNCTION__);
+    }
+  else
+    {
+      if (check_not_nested_p)
+	{
+	  /* No nesting.  */
+	  assert (thr->prof_info == NULL);
+	  assert (thr->api_info == NULL);
+	}
+
+      if (__builtin_expect (!thr->prof_callbacks_enabled, true))
+	{
+	  gomp_debug (0, "  %s: disabled for this thread\n", __FUNCTION__);
+	  ret = false;
+	  goto out;
+	}
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  /* 'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle.  */
+  if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true))
+    {
+      gomp_debug (0, "  %s: disabled globally\n", __FUNCTION__);
+      ret = false;
+      goto out_unlock;
+    }
+  else
+    ret = true;
+
+ out_unlock:
+  gomp_mutex_unlock (&goacc_prof_lock);
+
+ out:
+  return ret;
+}
+
+/* Set up to dispatch events?  */
+
+bool
+_goacc_profiling_setup_p (struct goacc_thread *thr,
+			  acc_prof_info *prof_info, acc_api_info *api_info)
+{
+  gomp_debug (0, "%s (%p)\n", __FUNCTION__, thr);
+
+  /* If we don't have any per-thread state yet, we can't register 'prof_info'
+     and 'api_info'.  */
+  if (__builtin_expect (thr == NULL, false))
+    {
+      gomp_debug (0, "Can't dispatch OpenACC Profiling Interface events for"
+		  " the current call, construct, or directive\n");
+      return false;
+    }
+
+  if (thr->prof_info != NULL)
+    {
+      /* Profiling has already been set up for an outer construct.  In this
+	 case, we continue to use the existing information, and thus return
+	 'false' here.
+
+	 This can happen, for example, for an 'enter data' directive, which
+	 sets up profiling, then calls into 'acc_copyin', which should not
+	 again set up profiling, should not overwrite the existing
+	 information.  */
+      return false;
+    }
+
+  thr->prof_info = prof_info;
+  thr->api_info = api_info;
+
+  /* Fill in some defaults.  */
+
+  prof_info->event_type = -1; /* Must be set later.  */
+  prof_info->valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+  prof_info->version = _ACC_PROF_INFO_VERSION;
+  if (thr->dev)
+    {
+      prof_info->device_type = acc_device_type (thr->dev->type);
+      prof_info->device_number = thr->dev->target_id;
+    }
+  else
+    {
+      prof_info->device_type = -1;
+      prof_info->device_number = -1;
+    }
+  prof_info->thread_id = -1;
+  prof_info->async = acc_async_sync;
+  prof_info->async_queue = prof_info->async;
+  prof_info->src_file = NULL;
+  prof_info->func_name = NULL;
+  prof_info->line_no = -1;
+  prof_info->end_line_no = -1;
+  prof_info->func_line_no = -1;
+  prof_info->func_end_line_no = -1;
+
+  api_info->device_api = acc_device_api_none;
+  api_info->valid_bytes = _ACC_API_INFO_VALID_BYTES;
+  api_info->device_type = prof_info->device_type;
+  api_info->vendor = -1;
+  api_info->device_handle = NULL;
+  api_info->context_handle = NULL;
+  api_info->async_handle = NULL;
+
+  return true;
+}
+
+/* Dispatch events.
+
+   This must only be called if 'GOACC_PROFILING_DISPATCH_P' or
+   'GOACC_PROFILING_SETUP_P' returned a true result.  */
+
+void
+goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info,
+			  acc_api_info *apt_info)
+{
+  acc_event_t event_type = event_info->event_type;
+  gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type);
+  assert (event_type > acc_ev_none
+	  && event_type < acc_ev_last);
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  if (!goacc_prof_callbacks_enabled[event_type])
+    {
+      gomp_debug (0, "  disabled for this event type\n");
+
+      goto out_unlock;
+    }
+
+  for (struct goacc_prof_callback_entry *e
+	 = goacc_prof_callback_entries[event_type];
+       e != NULL;
+       e = e->next)
+    {
+      if (!e->enabled)
+	{
+	  gomp_debug (0, "  disabled for callback %p\n", e->cb);
+	  continue;
+	}
+
+      gomp_debug (0, "  calling callback %p\n", e->cb);
+      e->cb (prof_info, event_info, apt_info);
+    }
+
+ out_unlock:
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 8f71e69acb60..3469116cf610 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -37,6 +37,7 @@ 
 #include "libgomp-plugin.h"
 #include "oacc-plugin.h"
 #include "gomp-constants.h"
+#include "oacc-int.h"
 
 #include <pthread.h>
 #include <cuda.h>
@@ -904,27 +905,122 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   // num_gangs		nctaid.x
   // num_workers	ntid.y
   // vector length	ntid.x
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info enqueue_launch_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_start;
+
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      enqueue_launch_event_info.launch_event.valid_bytes
+	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
+      enqueue_launch_event_info.launch_event.parent_construct
+	= acc_construct_parallel;
+      enqueue_launch_event_info.launch_event.implicit = 1;
+      enqueue_launch_event_info.launch_event.tool_info = NULL;
+      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
+      enqueue_launch_event_info.launch_event.num_gangs
+	= dims[GOMP_DIM_GANG];
+      enqueue_launch_event_info.launch_event.num_workers
+	= dims[GOMP_DIM_WORKER];
+      enqueue_launch_event_info.launch_event.vector_length
+	= dims[GOMP_DIM_VECTOR];
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+					    api_info);
+    }
+
   kargs[0] = &dp;
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
 		    dims[GOMP_DIM_GANG], 1, 1,
 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
 		    0, stream, kargs, 0);
 
+  if (profiling_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_end;
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+					    api_info);
+    }
+
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
 		     targ_fn->launch->fn);
 }
 
 void * openacc_get_current_cuda_context (void);
 
+static void
+goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
+{
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+
+  prof_info->event_type = acc_ev_alloc;
+
+  data_event_info.data_event.event_type = prof_info->event_type;
+  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+  data_event_info.data_event.parent_construct = acc_construct_parallel;
+  data_event_info.data_event.implicit = 1;
+  data_event_info.data_event.tool_info = NULL;
+  data_event_info.data_event.var_name = NULL;
+  data_event_info.data_event.bytes = s;
+  data_event_info.data_event.host_ptr = NULL;
+  data_event_info.data_event.device_ptr = dp;
+
+  api_info->device_api = acc_device_api_cuda;
+
+  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
+}
+
 static void *
 nvptx_alloc (size_t s)
 {
   CUdeviceptr d;
 
   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_p)
+    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
+
   return (void *) d;
 }
 
+static void
+goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
+{
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+
+  prof_info->event_type = acc_ev_free;
+
+  data_event_info.data_event.event_type = prof_info->event_type;
+  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+  data_event_info.data_event.parent_construct = acc_construct_parallel;
+  data_event_info.data_event.implicit = 1;
+  data_event_info.data_event.tool_info = NULL;
+  data_event_info.data_event.var_name = NULL;
+  data_event_info.data_event.bytes = -1;
+  data_event_info.data_event.host_ptr = NULL;
+  data_event_info.data_event.device_ptr = p;
+
+  api_info->device_api = acc_device_api_cuda;
+
+  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
+}
+
 static bool
 nvptx_free (void *p, struct ptx_device *ptx_dev)
 {
@@ -952,6 +1048,12 @@  nvptx_free (void *p, struct ptx_device *ptx_dev)
     }
 
   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_p)
+    goacc_profiling_acc_ev_free (thr, p);
+
   return true;
 }
 
@@ -1250,22 +1352,61 @@  GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
 {
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_p = __builtin_expect (prof_info != NULL, false);
+
   void **hp = NULL;
   CUdeviceptr dp = 0;
 
   if (mapnum > 0)
     {
-      hp = alloca (mapnum * sizeof (void *));
+      size_t s = mapnum * sizeof (void *);
+      hp = alloca (s);
       for (int i = 0; i < mapnum; i++)
 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
-      CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
+      if (profiling_p)
+	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
     }
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
   if (mapnum > 0)
-    CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
-		      mapnum * sizeof (void *));
+    {
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_start;
+
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  data_event_info.data_event.valid_bytes
+	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+	  data_event_info.data_event.parent_construct
+	    = acc_construct_parallel;
+	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
+	  data_event_info.data_event.tool_info = NULL;
+	  data_event_info.data_event.var_name = NULL;
+	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
+	  data_event_info.data_event.host_ptr = hp;
+	  data_event_info.data_event.device_ptr = (const void *) dp;
+
+	  api_info->device_api = acc_device_api_cuda;
+
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
+      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
+			mapnum * sizeof (void *));
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_end;
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
+    }
 
   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
 	      dp, NULL);
@@ -1277,7 +1418,10 @@  GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
 		       maybe_abort_msg);
   else if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+
   CUDA_CALL_ASSERT (cuMemFree, dp);
+  if (profiling_p)
+    goacc_profiling_acc_ev_free (thr, (void *) dp);
 }
 
 static void
@@ -1296,23 +1440,54 @@  GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
 {
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_p = __builtin_expect (prof_info != NULL, false);
+
   void **hp = NULL;
   CUdeviceptr dp = 0;
   void **block = NULL;
 
   if (mapnum > 0)
     {
-      block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
+      size_t s = mapnum * sizeof (void *);
+      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
       hp = block + 2;
       for (int i = 0; i < mapnum; i++)
 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
-      CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
+      if (profiling_p)
+	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
     }
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
   if (mapnum > 0)
     {
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_start;
+
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  data_event_info.data_event.valid_bytes
+	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+	  data_event_info.data_event.parent_construct
+	    = acc_construct_parallel;
+	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
+	  data_event_info.data_event.tool_info = NULL;
+	  data_event_info.data_event.var_name = NULL;
+	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
+	  data_event_info.data_event.host_ptr = hp;
+	  data_event_info.data_event.device_ptr = (const void *) dp;
+
+	  api_info->device_api = acc_device_api_cuda;
+
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
+
       CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
 			mapnum * sizeof (void *), aq->cuda_stream);
       block[0] = (void *) dp;
@@ -1320,7 +1495,16 @@  GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
       struct nvptx_thread *nvthd =
 	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
       block[1] = (void *) nvthd->ptx_dev;
+
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_end;
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
     }
+
   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
 	      dp, aq->cuda_stream);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
new file mode 100644
index 000000000000..d929bfd80a4f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
@@ -0,0 +1,353 @@ 
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+	  || state == 10
+	  || state == 30
+	  || state == 41
+	  || state == 51
+	  || state == 91
+	  || state == 101
+	  || state == 151);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+	  || state == 11
+	  || state == 40
+	  || state == 50
+	  || state == 90
+	  || state == 100
+	  || state == 150);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 14
+	  || state == 21
+	  || state == 32
+	  || state == 42
+	  || state == 80
+	  || state == 103
+	  || state == 152);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 13
+	  || state == 43
+	  || state == 102
+	  || state == 154);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 12
+	  || state == 20
+	  || state == 31
+	  || state == 44
+	  || state == 81
+	  || state == 104
+	  || state == 153);
+  STATE_OP (state, ++);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  STATE_OP (state, = 0);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 2);
+  }
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 12);
+  }
+  assert (state == 15);
+
+  STATE_OP (state, = 20);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 20);
+  }
+  assert (state == 20);
+
+  STATE_OP (state, = 30);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 31);
+  }
+  assert (state == 33);
+
+  STATE_OP (state, = 40);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 42);
+  }
+  assert (state == 45);
+
+  STATE_OP (state, = 50);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 52);
+  }
+  assert (state == 52);
+
+  STATE_OP (state, = 60);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 60);
+  }
+  assert (state == 60);
+
+  STATE_OP (state, = 70);
+  unreg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 70);
+  }
+  assert (state == 70);
+
+  STATE_OP (state, = 80);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 80);
+  }
+  assert (state == 82);
+
+  STATE_OP (state, = 90);
+  reg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 92);
+  }
+  assert (state == 92);
+
+  STATE_OP (state, = 100);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 102);
+  }
+  assert (state == 105);
+
+  STATE_OP (state, = 110);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 110);
+  }
+  assert (state == 110);
+
+  STATE_OP (state, = 120);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 120);
+  }
+  assert (state == 120);
+
+  STATE_OP (state, = 130);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 130);
+  }
+  assert (state == 130);
+
+  STATE_OP (state, = 140);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 140);
+  }
+  assert (state == 140);
+
+  STATE_OP (state, = 150);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 152);
+  }
+  assert (state == 155);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
new file mode 100644
index 000000000000..b356feb8108c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -0,0 +1,316 @@ 
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+	  || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 1)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 2)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 10
+	  || state == 110);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 11
+	  || state == 111);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  assert (state == 0);
+
+  acc_init (acc_device_host);
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = 12;
+
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 11);
+  }
+  assert (state == 12);
+
+  STATE_OP (state, = 90);
+  acc_shutdown (acc_device_host);
+  assert (state == 90);
+
+
+  STATE_OP (state, = 100);
+  acc_init (acc_device_default);
+  assert (state == 102);
+
+  STATE_OP (state, = 110);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = 12;
+
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 111);
+  }
+  assert (state == 112);
+
+  STATE_OP (state, = 190);
+  acc_shutdown (acc_device_default);
+  assert (state == 190);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
new file mode 100644
index 000000000000..7cfc364e4113
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
@@ -0,0 +1,229 @@ 
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
+   'libgomp.texi'.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+volatile // TODO PR90488
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int num_gangs, num_workers, vector_length;
+
+
+static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 0);
+  STATE_OP (state, = 1);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    const char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  if (num_gangs < 1)
+    assert (event_info->launch_event.num_gangs >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_gangs == num_gangs);
+#else
+      /* No parallelized OpenACC 'kernels' constructs.  Unparallelized OpenACC
+	 'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels.  */
+      assert (event_info->launch_event.num_gangs == 1);
+#endif
+    }
+  if (num_workers < 1)
+    assert (event_info->launch_event.num_workers >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_workers == num_workers);
+#else
+      /* See 'num_gangs' above.  */
+      assert (event_info->launch_event.num_workers == 1);
+#endif
+    }
+  if (vector_length < 1)
+    assert (event_info->launch_event.vector_length >= 1);
+  else if (acc_device_type == acc_device_nvidia) /* ... is special.  */
+    assert (event_info->launch_event.vector_length == 32);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.vector_length == vector_length);
+#else
+      /* See 'num_gangs' above.  */
+      assert (event_info->launch_event.vector_length == 1);
+#endif
+    }
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  STATE_OP (state, = 0);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  assert (state == 0);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  assert (state == 0);
+
+  /* Parallelism dimensions: compiler/runtime decides.  */
+  STATE_OP (state, = 0);
+  num_gangs = num_workers = vector_length = 0;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No 'acc_ev_enqueue_launch_start'.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  /* Parallelism dimensions: literal.  */
+  STATE_OP (state, = 0);
+  num_gangs = 30;
+  num_workers = 3;
+  vector_length = 5;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  num_gangs (30) num_workers (3) vector_length (5)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No 'acc_ev_enqueue_launch_start'.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  /* Parallelism dimensions: variable.  */
+  STATE_OP (state, = 0);
+  num_gangs = 22;
+  num_workers = 5;
+  vector_length = 7;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No 'acc_ev_enqueue_launch_start'.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
new file mode 100644
index 000000000000..ac6eb48cbbef
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -0,0 +1,719 @@ 
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
+   libgomp.texi.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+/* Do we expect to see 'acc_ev_exit_data_start' and 'acc_ev_exit_data_end'
+   after a compute construct with an 'async' clause?  */
+#define ASYNC_EXIT_DATA 1
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+#else
+  assert (state == 0
+	  || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+#else
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+#endif
+}
+
+static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 2
+	  || state == 102);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_device_init_start);
+#else
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (event_info->other_event.tool_info == tool_info->nested);
+#else
+  assert (event_info->other_event.tool_info == tool_info);
+#endif
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+#else
+  free (tool_info);
+  tool_info = NULL;
+#endif
+}
+
+static void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 3
+	  || state == 103);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_enter_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+static void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 4
+	  || state == 104);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_enter_data_start);
+
+  assert (prof_info->event_type == acc_ev_enter_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+static void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 7
+#if ASYNC_EXIT_DATA
+	  || state == 107
+#endif
+	  );
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_exit_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+static void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 8
+#if ASYNC_EXIT_DATA
+	  || state == 108
+#endif
+	  );
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_exit_data_start);
+
+  assert (prof_info->event_type == acc_ev_exit_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 0
+	  || state == 100);
+  if (state == 100)
+    {
+      /* Compensate for the missing 'acc_ev_device_init_start' and
+	 'acc_ev_device_init_end'.  */
+      state += 2;
+    }
+#else
+  if (state == 100)
+    {
+      /* Compensate for the missing 'acc_ev_device_init_start' and
+	 'acc_ev_device_init_end'.  */
+      state += 2;
+    }
+  assert (state == 2
+	  || state == 102);
+#endif
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing 'acc_ev_enter_data_start'.  */
+      state += 1;
+    }
+}
+
+static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing 'acc_ev_enter_data_end'.  */
+      state += 1;
+      /* Compensate for the missing 'acc_ev_enqueue_launch_start' and
+	 'acc_ev_enqueue_launch_end'.  */
+      state += 2;
+      /* Compensate for the missing 'acc_ev_exit_data_start' and
+	 'acc_ev_exit_data_end'.  */
+      state += 2;
+    }
+#if !ASYNC_EXIT_DATA
+  else if (acc_async != acc_async_sync)
+    {
+      /* Compensate for the missing 'acc_ev_exit_data_start' and
+	 'acc_ev_exit_data_end'.  */
+      state += 2;
+    }
+#endif
+  assert (state == 9
+	  || state == 109);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 5
+	  || state == 105);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    const char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  assert (event_info->launch_event.num_gangs >= 1);
+  assert (event_info->launch_event.num_workers >= 1);
+  assert (event_info->launch_event.vector_length >= 1);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
+  tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
+  tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
+  tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
+  tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 6
+	  || state == 106);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
+  assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
+  assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
+  assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
+  assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == tool_info->nested);
+  assert (event_info->launch_event.kernel_name != NULL);
+  assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
+  assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
+  assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
+  assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg);
+  reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg);
+  reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg);
+  reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg);
+  assert (state == 0);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = acc_async_sync;
+  assert (state == 0);
+
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      asm volatile ("" : : : "memory"); // TODO PR90488
+
+      state_init = state;
+    }
+    assert (state_init == 4);
+  }
+  assert (state == 10);
+
+  STATE_OP (state, = 100);
+
+  acc_async = 12;
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      asm volatile ("" : : : "memory"); // TODO PR90488
+
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 104);
+  }
+  assert (state == 110);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
new file mode 100644
index 000000000000..5b58c51d4c42
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
@@ -0,0 +1,226 @@ 
+/* Test the 'valid_bytes' magic.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int ev_count_data;
+
+static void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+
+  ++ev_count_data;
+}
+
+static int ev_count_launch;
+
+static void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+
+  ++ev_count_launch;
+}
+
+static int ev_count_other;
+
+static void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+
+  ++ev_count_other;
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
+  reg_ (acc_ev_create, cb_data_event, acc_reg);
+  reg_ (acc_ev_delete, cb_data_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_data_event, acc_reg);
+  reg_ (acc_ev_free, cb_data_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
+}
+
+
+/* Basic struct.  */
+typedef struct A
+{
+  int a;
+  int b;
+#define VALID_BYTES_A \
+  _ACC_PROF_VALID_BYTES_STRUCT (A, b, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (int))
+} A;
+
+/* Add a 'char' field.  */
+typedef struct B
+{
+  int a;
+  int b;
+  char c;
+#define VALID_BYTES_B \
+  _ACC_PROF_VALID_BYTES_STRUCT (B, c, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} B;
+
+/* Add another 'char' field.  */
+typedef struct C
+{
+  int a;
+  int b;
+  char c, d;
+#define VALID_BYTES_C \
+  _ACC_PROF_VALID_BYTES_STRUCT (C, d, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} C;
+
+/* Add two 'void *' fields.  */
+typedef struct D
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+#define VALID_BYTES_D \
+  _ACC_PROF_VALID_BYTES_STRUCT (D, f, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} D;
+
+/* Add another three 'char' fields.  */
+typedef struct E
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+  char g, h, i;
+#define VALID_BYTES_E \
+  _ACC_PROF_VALID_BYTES_STRUCT (E, i, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} E;
+
+
+int main()
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  A A1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
+  assert (VALID_BYTES_A <= sizeof A1);
+  DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
+  assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
+
+  B B1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
+  assert (VALID_BYTES_B <= sizeof B1);
+  DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
+  assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
+
+  assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
+
+  C C1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
+  assert (VALID_BYTES_C <= sizeof C1);
+  DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
+  assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
+
+  assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
+
+  D D1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
+  assert (VALID_BYTES_D <= sizeof D1);
+  DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
+  assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
+
+  assert (VALID_BYTES_D > VALID_BYTES_C);
+
+  E E1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
+  assert (VALID_BYTES_E <= sizeof E1);
+  DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
+  assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
+
+  assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
+
+  ev_count_data = 0;
+  ev_count_launch = 0;
+  ev_count_other = 0;
+
+  /* Trigger tests done in 'cb_*' functions.  */
+  int host;
+#pragma acc parallel copyout (host)
+  {
+    asm volatile ("" : : : "memory"); // TODO PR90488
+
+    host = acc_on_device (acc_device_host);
+  }
+
+  DEBUG_printf ("ev_count_data = %d\n", ev_count_data);
+  if (host)
+    assert (ev_count_data == 0);
+  else
+    {
+      /* We don't know exactly how many data events to expect, but we at least
+	 expect some.  */
+      assert (ev_count_data > 0);
+    }
+
+  DEBUG_printf ("ev_count_launch = %d\n", ev_count_launch);
+  if (host)
+    assert (ev_count_data == 0);
+  else
+    {
+      /* We expect two launch events, 'acc_ev_enqueue_launch_start',
+	 'acc_ev_enqueue_launch_end'.  */
+      assert (ev_count_launch == 2);
+    }
+
+  DEBUG_printf ("ev_count_other = %d\n", ev_count_other);
+  /* We don't know exactly how many other events to expect, but we at least
+     expect 'acc_ev_device_init_start', 'acc_ev_device_init_end',
+     'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'.  */
+  assert (ev_count_other >= 4);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
new file mode 100644
index 000000000000..f53786871671
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
@@ -0,0 +1,76 @@ 
+/* Test the 'version' field of 'acc_prof_info'.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int ev_count;
+
+
+static void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->version == 201711);
+
+  ++ev_count;
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
+  reg_ (acc_ev_create, cb_any_event, acc_reg);
+  reg_ (acc_ev_delete, cb_any_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_any_event, acc_reg);
+  reg_ (acc_ev_free, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
+}
+
+
+int main()
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  ev_count = 0;
+
+  /* Trigger tests done in 'cb_*' functions.  */
+#pragma acc parallel
+  {
+    asm volatile ("" : : : "memory"); // TODO PR90488
+  }
+
+  DEBUG_printf ("ev_count = %d\n", ev_count);
+  /* We don't know exactly how many events to expect, but we at least expect
+     'acc_ev_device_init_start', 'acc_ev_device_init_end',
+     'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'.  */
+  assert (ev_count >= 4);
+
+  return 0;
+}
-- 
2.17.1