[2/2] OpenACC: Add Graphite-based "kernels" handling to pass_convert_oacc_kernels

Message ID xu8f1rgyj2zs.fsf@harwath.name
State New
Headers show
Series
  • Use Graphite for OpenACC "kernels" regions
Related show

Commit Message

Frederik Harwath Nov. 12, 2020, 10:14 a.m.
This patch changes the "kernels" conversion to route loops in OpenACC
"kernels" regions through Graphite. This is done by converting the loops
in "kernels" regions which are not yet known to be "independent" to
"auto" loops as in the current (OG10) "parloops" based "kernels"
handling. Afterwards, the "kernels" regions will now be treated
essentially like "parallel" regions. A new internal target kind however
still enables to distinguish between the types of regions which is
useful for diagnostic messages.

The old "parloops" based "kernels" handling will be deprecated, but is
still available through the command line options
"-fopenacc-kernels=split-parloops" and "-fopenacc-kernels=parloops".
---
 gcc/c-family/c.opt                            |  5 +-
 gcc/doc/invoke.texi                           | 10 ++-
 gcc/doc/passes.texi                           |  6 +-
 gcc/flag-types.h                              |  1 +
 gcc/gimple-pretty-print.c                     |  3 +
 gcc/gimple.h                                  |  9 ++-
 gcc/gimplify.c                                |  1 +
 gcc/omp-expand.c                              | 63 +++++++++++++--
 gcc/omp-general.c                             | 19 ++++-
 gcc/omp-general.h                             |  1 +
 gcc/omp-low.c                                 | 76 +++++++++++++++----
 gcc/omp-oacc-kernels.c                        | 59 ++++++++++++--
 gcc/omp-offload.c                             | 50 +++++++++++-
 .../goacc/kernels-conversion-parloops.c       | 61 +++++++++++++++
 .../c-c++-common/goacc/kernels-conversion.c   | 12 +--
 .../gfortran.dg/goacc/kernels-reductions.f90  | 37 +++++++++
 gcc/tree-parloops.c                           | 16 +++-
 gcc/tree-ssa-loop.c                           | 10 +++
 18 files changed, 395 insertions(+), 44 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-conversion-parloops.c
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90

--
2.17.1


-----------------
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander Walter

Patch

diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index 4ef7ea76aa1..255ff84ca4b 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -1747,7 +1747,7 @@  Specify default OpenACC compute dimensions.

 fopenacc-kernels=
 C ObjC C++ ObjC++ RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_SPLIT)
--fopenacc-kernels=[split|parloops]     Configure OpenACC 'kernels' constructs handling.
+-fopenacc-kernels=[split|split-parloops|parloops]      Configure OpenACC 'kernels' constructs handling.

 Enum
 Name(openacc_kernels) Type(enum openacc_kernels)
@@ -1755,6 +1755,9 @@  Name(openacc_kernels) Type(enum openacc_kernels)
 EnumValue
 Enum(openacc_kernels) String(split) Value(OPENACC_KERNELS_SPLIT)

+EnumValue
+Enum(openacc_kernels) String(split-parloops) Value(OPENACC_KERNELS_SPLIT_PARLOOPS)
+
 EnumValue
 Enum(openacc_kernels) String(parloops) Value(OPENACC_KERNELS_PARLOOPS)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fe04b4d8e6a..d713d6ae8ab 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -2266,12 +2266,20 @@  permitted.
 @opindex fopenacc-kernels
 @cindex OpenACC accelerator programming
 Configure OpenACC 'kernels' constructs handling.
+
 With @option{-fopenacc-kernels=split}, OpenACC 'kernels' constructs
 are split into a sequence of compute constructs, each then handled
-individually.
+individually. The data dependence analysis that is necessary to
+determine if loops can be parallelized is performed by the Graphite
+pass.
 This is the default.
+With @option{-fopenacc-kernels=split-parloops}, OpenACC 'kernels' constructs
+are split into a sequence of compute constructs, each then handled
+individually.
+This is deprecated.
 With @option{-fopenacc-kernels=parloops}, the whole OpenACC
 'kernels' constructs is handled by the @samp{parloops} pass.
+This is deprecated.

 @item -fopenmp
 @opindex fopenmp
diff --git a/gcc/doc/passes.texi b/gcc/doc/passes.texi
index 7424690dac3..5dda056a2bb 100644
--- a/gcc/doc/passes.texi
+++ b/gcc/doc/passes.texi
@@ -248,9 +248,9 @@  constraints in order to generate the points-to sets.  It is located in

 This is a pass group for processing OpenACC kernels regions.  It is a
 subpass of the IPA OpenACC pass group that runs on offloaded functions
-containing OpenACC kernels loops.  It is located in
-@file{tree-ssa-loop.c} and is described by
-@code{pass_ipa_oacc_kernels}.
+containing OpenACC kernels loops if @samp{parloops} based handling of
+kernels regions is used. It is located in @file{tree-ssa-loop.c} and
+is described by @code{pass_ipa_oacc_kernels}.

 @item Target clone

diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index e2255a56745..058c4e214af 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -376,6 +376,7 @@  enum cf_protection_level
 enum openacc_kernels
 {
   OPENACC_KERNELS_SPLIT,
+  OPENACC_KERNELS_SPLIT_PARLOOPS,
   OPENACC_KERNELS_PARLOOPS
 };

diff --git a/gcc/gimple-pretty-print.c b/gcc/gimple-pretty-print.c
index 54a6d318dc5..b4a2b43ebed 100644
--- a/gcc/gimple-pretty-print.c
+++ b/gcc/gimple-pretty-print.c
@@ -1701,6 +1701,9 @@  dump_gimple_omp_target (pretty_printer *buffer, const gomp_target *gs,
     case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
       kind = " oacc_parallel_kernels_gang_single";
       break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+      kind = " oacc_parallel_kernels_graphite";
+      break;
     case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       kind = " oacc_data_kernels";
       break;
diff --git a/gcc/gimple.h b/gcc/gimple.h
index bfc2eef21a2..7b25024e035 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -169,7 +169,7 @@  enum gf_mask {
        loop statements.  */
     GF_OMP_FOR_GRID_INTRA_GROUP        = 1 << 5,
     GF_OMP_FOR_GRID_GROUP_ITER  = 1 << 6,
-    GF_OMP_TARGET_KIND_MASK    = (1 << 4) - 1,
+    GF_OMP_TARGET_KIND_MASK    = (1 << 5) - 1,
     GF_OMP_TARGET_KIND_REGION  = 0,
     GF_OMP_TARGET_KIND_DATA    = 1,
     GF_OMP_TARGET_KIND_UPDATE  = 2,
@@ -189,9 +189,12 @@  enum gf_mask {
     /* A GF_OMP_TARGET_KIND_OACC_PARALLEL that originates from a 'kernels'
        construct, "gang-single".  */
     GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE = 14,
+    /* A GF_OMP_TARGET_KIND_OACC_PARALLEL that originates from a 'kernels'
+       construct, for Graphite to analyze.  */
+    GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE = 15,
     /* A GF_OMP_TARGET_KIND_OACC_DATA that originates from a 'kernels'
        construct.  */
-    GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 15,
+    GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 16,
     GF_OMP_TEAMS_GRID_PHONY    = 1 << 0,
     GF_OMP_TEAMS_HOST          = 1 << 1,

@@ -6610,6 +6613,7 @@  is_gimple_omp_oacc (const gimple *stmt)
        case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
        case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
        case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+       case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
        case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
          return true;
        default:
@@ -6638,6 +6642,7 @@  is_gimple_omp_offloaded (const gimple *stmt)
        case GF_OMP_TARGET_KIND_OACC_SERIAL:
        case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
        case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+       case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
          return true;
        default:
          return false;
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 0b3ba5997cf..292983f2dab 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -13103,6 +13103,7 @@  gimplify_omp_workshare (tree *expr_p, gimple_seq *pre_p)
     {
       push_gimplify_context ();

+      //TODO-kernels: What needs to be done here?
       /* FIXME: Reductions are not supported in kernels regions yet.  */
       if (/*ort == ORT_ACC_KERNELS ||*/ ort == ORT_ACC_PARALLEL)
         localize_reductions (OMP_CLAUSES (expr), OMP_BODY (expr));
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index b3d21c1181d..a5c3c8ae1a7 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -108,7 +108,8 @@  struct omp_region
      a depend clause.  */
   gomp_ordered *ord_stmt;

-  /* True if this is nested inside an OpenACC kernels construct.  */
+  /* True if this is nested inside an OpenACC kernels construct that
+     will be handled by the "parloops" pass.  */
   bool inside_kernels_p;
 };

@@ -6579,12 +6580,36 @@  expand_omp_for (struct omp_region *region, gimple *inner_stmt)
     loops_state_set (LOOPS_NEED_FIXUP);

   if (region->inside_kernels_p)
-    expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
-                           NULL_TREE, inner_stmt);
+    {
+      gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                          || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+      expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+                             NULL_TREE, inner_stmt);
+    }
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_SIMD)
     expand_omp_simd (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
     {
+      struct omp_region *target_region;
+      for (target_region = region->outer; target_region;
+          target_region = target_region->outer)
+       {
+         if (region->type == GIMPLE_OMP_TARGET)
+           {
+             gomp_target *entry_stmt
+               = as_a <gomp_target *> (last_stmt (target_region->entry));
+
+             if (gimple_omp_target_kind (entry_stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+               gcc_checking_assert (flag_openacc_kernels != OPENACC_KERNELS_SPLIT_PARLOOPS
+                                    && flag_openacc_kernels != OPENACC_KERNELS_PARLOOPS);
+
+           }
+
+       }
+
+
+
       gcc_assert (!inner_stmt);
       expand_oacc_for (region, &fd);
     }
@@ -7674,6 +7699,9 @@  static void
 mark_loops_in_oacc_kernels_region (basic_block region_entry,
                                   basic_block region_exit)
 {
+  gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                      || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
   class loop *outer = region_entry->loop_father;
   gcc_assert (region_exit == NULL || outer == region_exit->loop_father);

@@ -7955,6 +7983,10 @@  expand_omp_target (struct omp_region *region)

   entry_stmt = as_a <gomp_target *> (last_stmt (region->entry));
   target_kind = gimple_omp_target_kind (entry_stmt);
+  if (!(flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+       || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS))
+    gcc_checking_assert (target_kind != GF_OMP_TARGET_KIND_OACC_KERNELS);
+
   new_bb = region->entry;
   oacc_explode_args = false;

@@ -7964,6 +7996,7 @@  expand_omp_target (struct omp_region *region)
     case GF_OMP_TARGET_KIND_OACC_PARALLEL:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
     case GF_OMP_TARGET_KIND_OACC_SERIAL:
       if (targetm.goacc.explode_args ())
        oacc_explode_args = true;
@@ -8010,6 +8043,9 @@  expand_omp_target (struct omp_region *region)
   switch (target_kind)
     {
     case GF_OMP_TARGET_KIND_OACC_KERNELS:
+      gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                          || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
       mark_loops_in_oacc_kernels_region (region->entry, region->exit);

       DECL_ATTRIBUTES (child_fn)
@@ -8031,6 +8067,11 @@  expand_omp_target (struct omp_region *region)
        = tree_cons (get_identifier ("oacc parallel_kernels_gang_single"),
                     NULL_TREE, DECL_ATTRIBUTES (child_fn));
       break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+      DECL_ATTRIBUTES (child_fn)
+       = tree_cons (get_identifier ("oacc parallel_kernels_graphite"),
+                    NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
     default:
       break;
     }
@@ -8240,6 +8281,7 @@  expand_omp_target (struct omp_region *region)
     case GF_OMP_TARGET_KIND_OACC_SERIAL:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
       start_ix = BUILT_IN_GOACC_PARALLEL;
       break;
     case GF_OMP_TARGET_KIND_OACC_DATA:
@@ -8885,6 +8927,9 @@  expand_omp (struct omp_region *region)
        {
          grid_expand_target_grid_body (region);

+         if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+             || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+         {
          if (region->inner)
            {
              gomp_target *entry
@@ -8894,6 +8939,7 @@  expand_omp (struct omp_region *region)
                      == GF_OMP_TARGET_KIND_OACC_KERNELS))
                region->inner->inside_kernels_p = true;
            }
+         }
        }

       if (region->type == GIMPLE_OMP_FOR
@@ -9046,6 +9092,7 @@  build_omp_regions_1 (basic_block bb, struct omp_region *parent,
                case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
                case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
                case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+               case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
                case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
                  break;
                case GF_OMP_TARGET_KIND_UPDATE:
@@ -9232,9 +9279,14 @@  public:
   /* opt_pass methods: */
   virtual bool gate (function *fun)
     {
-      return !(fun->curr_properties & PROP_gimple_eomp);
+      return !(fun->curr_properties & PROP_gimple_eomp)
+       && (!oacc_get_kernels_attrib (cfun->decl)
+           || flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+           || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+
     }
-  virtual unsigned int execute (function *) { return execute_expand_omp (); }
+  virtual unsigned int execute (function *) {return execute_expand_omp ();}
   opt_pass * clone () { return new pass_expand_omp_ssa (m_ctxt); }

 }; // class pass_expand_omp_ssa
@@ -9304,6 +9356,7 @@  omp_make_gimple_edges (basic_block bb, struct omp_region **region,
        case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
        case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
        case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+       case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
        case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
          break;
        case GF_OMP_TARGET_KIND_UPDATE:
diff --git a/gcc/omp-general.c b/gcc/omp-general.c
index b1c60cfe421..6e2adb037a5 100644
--- a/gcc/omp-general.c
+++ b/gcc/omp-general.c
@@ -1962,6 +1962,16 @@  oacc_get_fn_attrib (tree fn)
   return lookup_attribute (OACC_FN_ATTRIB, DECL_ATTRIBUTES (fn));
 }

+/* Retrieve the oacc kernels attrib and return it.  Non-oacc
+   functions will return NULL.  */
+
+tree
+oacc_get_kernels_attrib (tree fn)
+{
+  return lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn));
+}
+
+
 /* Return true if FN is an OpenMP or OpenACC offloading function.  */

 bool
@@ -1988,10 +1998,15 @@  oacc_get_fn_dim_size (tree fn, int axis)
     dims = TREE_CHAIN (dims);

   tree v = TREE_VALUE (dims);
-  /* TODO With 'pass_oacc_device_lower' moved "later", this is necessary to
+  /* TODO-kernels With 'pass_oacc_device_lower' moved "later", this is necessary to
      avoid ICE for some OpenACC 'kernels' ("parloops") constructs.  */
   if (v == NULL_TREE)
-    return 0;
+    {
+      gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                          || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+      return 0;
+    }

   int size = TREE_INT_CST_LOW (v);

diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index 4b6f0e3e43f..7497c9c4eff 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -99,6 +99,7 @@  extern int oacc_verify_routine_clauses (tree, tree *, location_t,
                                        const char *);
 extern tree oacc_build_routine_dims (tree clauses);
 extern tree oacc_get_fn_attrib (tree fn);
+extern tree oacc_get_kernels_attrib (tree fn);
 extern bool offloading_function_p (tree fn);
 extern int oacc_get_fn_dim_size (tree fn, int axis);
 extern int oacc_get_ifn_dim_arg (const gimple *stmt);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index a740c6e4146..f1f943c17c8 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -210,7 +210,9 @@  is_oacc_parallel_or_serial (omp_context *ctx)
              || (gimple_omp_target_kind (ctx->stmt)
                  == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
              || (gimple_omp_target_kind (ctx->stmt)
-                 == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)));
+                 == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+             || (gimple_omp_target_kind (ctx->stmt)
+                 == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)));
 }

 /* Return true if CTX corresponds to an oacc kernels region.  */
@@ -236,6 +238,8 @@  was_originally_oacc_kernels (omp_context *ctx)
               == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
              || (gimple_omp_target_kind (ctx->stmt)
                  == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+             || (gimple_omp_target_kind (ctx->stmt)
+                 == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
              || (gimple_omp_target_kind (ctx->stmt)
                  == GF_OMP_TARGET_KIND_OACC_DATA_KERNELS)));
 }
@@ -2566,11 +2570,19 @@  enclosing_target_ctx (omp_context *ctx)
 static bool
 ctx_in_oacc_kernels_region (omp_context *ctx)
 {
+  gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+                      || flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                      || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
   for (;ctx != NULL; ctx = ctx->outer)
     {
       gimple *stmt = ctx->stmt;
-      if (gimple_code (stmt) == GIMPLE_OMP_TARGET
-         && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+      if (gimple_code (stmt) != GIMPLE_OMP_TARGET)
+       continue;
+
+      int target_kind = gimple_omp_target_kind (stmt);
+      if (target_kind == GF_OMP_TARGET_KIND_OACC_KERNELS
+         || target_kind == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
        return true;
     }

@@ -2584,6 +2596,10 @@  ctx_in_oacc_kernels_region (omp_context *ctx)
 static unsigned
 check_oacc_kernel_gwv (gomp_for *stmt, omp_context *ctx)
 {
+  gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                      || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+                      || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
   bool checking = true;
   unsigned outer_mask = 0;
   unsigned this_mask = 0;
@@ -2656,7 +2672,7 @@  scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
       omp_context *tgt = enclosing_target_ctx (outer_ctx);

       if (!tgt || (is_oacc_parallel_or_serial (tgt)
-                    && !was_originally_oacc_kernels (tgt)))
+                  && !was_originally_oacc_kernels (tgt)))
        for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
          {
            char const *check = NULL;
@@ -2685,8 +2701,12 @@  scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
                        " OpenACC %<parallel%> or %<serial%>", check);
          }

-      if (tgt && is_oacc_kernels (tgt))
+      if (tgt && (is_oacc_kernels (tgt)))
        {
+         gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                              || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+                              || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
          /* Strip out reductions, as they are not handled yet.  */
          tree *prev_ptr = &clauses;

@@ -3183,14 +3203,18 @@  check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
        && is_gimple_omp_oacc (stmt))
       /* Except for atomic codes that we share with OpenMP.  */
       && !(gimple_code (stmt) == GIMPLE_OMP_ATOMIC_LOAD
-          || gimple_code (stmt) == GIMPLE_OMP_ATOMIC_STORE))
+          || gimple_code (stmt) == GIMPLE_OMP_ATOMIC_STORE)
+      /* Except for target regions introduced for kernels.  */
+      && (gimple_code (stmt) != GIMPLE_OMP_TARGET
+         || gimple_omp_target_kind (stmt) != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
     {
       if (oacc_get_fn_attrib (cfun->decl) != NULL)
        {
          error_at (gimple_location (stmt),
                    "non-OpenACC construct inside of OpenACC routine");
+
          return false;
-       }
+}
       else
        for (omp_context *octx = ctx; octx != NULL; octx = octx->outer)
          if (is_gimple_omp (octx->stmt)
@@ -3336,6 +3360,7 @@  check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
                  case GF_OMP_TARGET_KIND_OACC_SERIAL:
                  case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
                  case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+                 case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
                    ok = true;
                    break;

@@ -3794,6 +3819,7 @@  check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
              break;
            case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
            case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+           case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
            case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
              /* These three cases arise from kernels conversion.  */
              stmt_name = "kernels"; break;
@@ -3814,6 +3840,7 @@  check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
              ctx_stmt_name = "host_data"; break;
            case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
            case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+           case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
            case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
              /* These three cases arise from kernels conversion.  */
              ctx_stmt_name = "kernels"; break;
@@ -3822,7 +3849,9 @@  check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)

          /* OpenACC/OpenMP mismatch?  */
          if (is_gimple_omp_oacc (stmt)
-             != is_gimple_omp_oacc (ctx->stmt))
+             != is_gimple_omp_oacc (ctx->stmt)
+             && (gimple_code (stmt) != GIMPLE_OMP_TARGET
+                 || gimple_omp_target_kind (stmt) != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
            {
              error_at (gimple_location (stmt),
                        "%s %qs construct inside of %s %qs region",
@@ -7143,7 +7172,9 @@  lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
                        && (gimple_omp_target_kind (probe->stmt)
                            != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
                        && (gimple_omp_target_kind (probe->stmt)
-                           != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE))
+                           != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+                       && (gimple_omp_target_kind (probe->stmt)
+                           != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
                      goto do_lookup;

                    cls = gimple_omp_target_clauses (probe->stmt);
@@ -7225,7 +7256,7 @@  lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
        if (!ref_to_res)
          ref_to_res = integer_zero_node;

-       if (omp_is_reference (outgoing))
+       if (omp_is_reference (outgoing) && !omp_is_reference (var))
          {
            outgoing = build_simple_mem_ref (outgoing);

@@ -7954,7 +7985,15 @@  lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
   omp_context *tgt = enclosing_target_ctx (ctx);
   if ((!tgt || is_oacc_parallel_or_serial (tgt))
       && !(tag & (OLF_SEQ | OLF_AUTO)))
-    tag |= OLF_INDEPENDENT;
+    {
+      tag |= OLF_INDEPENDENT;
+
+      gcc_checking_assert (gimple_code (ctx->stmt) != GIMPLE_OMP_TARGET
+                          /* Loops in kernels regions that will be handled by Graphite should
+                             have been made 'auto' by "pass_convert_oacc_kernels". */
+                          || gimple_omp_target_kind (ctx->stmt)
+                          != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE);
+    }

   if (tag & OLF_TILE)
     /* Tiling could use all 3 levels.  */
@@ -11112,11 +11151,17 @@  lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   /* Once lowered, extract the bounds and clauses.  */
   omp_extract_for_data (stmt, &fd, NULL);

+  bool oacc_kernels_parloops = false;
+  if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+      || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+    oacc_kernels_parloops = ctx_in_oacc_kernels_region (ctx);
   if (is_gimple_omp_oacc (ctx->stmt)
-      && !ctx_in_oacc_kernels_region (ctx))
-    lower_oacc_head_tail (gimple_location (stmt),
-                         gimple_omp_for_clauses (stmt), private_marker,
-                         &oacc_head, &oacc_tail, ctx);
+      && !oacc_kernels_parloops)
+    {
+      lower_oacc_head_tail (gimple_location (stmt),
+                           gimple_omp_for_clauses (stmt), private_marker,
+                           &oacc_head, &oacc_tail, ctx);
+    }

   /* Add OpenACC partitioning and reduction markers just before the loop.  */
   if (oacc_head)
@@ -12003,6 +12048,7 @@  lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
     case GF_OMP_TARGET_KIND_OACC_DECLARE:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
       data_region = false;
       break;
     case GF_OMP_TARGET_KIND_DATA:
diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index fb36b50bbba..9088893c39a 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -179,8 +179,13 @@  adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
               compiler logic to analyze this, so can't parallelize it here, so
               we'd very likely be running into a performance problem if we
               were to execute this unparallelized, thus forward the whole loop
-              nest to "parloops".  */
-           *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+              nest to Graphite/"parloops".  */
+           if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+             *region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE;
+           else if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS)
+             *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+           else
+             gcc_unreachable ();
            /* Terminate: final decision for this region.  */
            *handled_ops_p = true;
            return integer_zero_node;
@@ -200,8 +205,15 @@  adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
         the compiler logic to analyze this, so can't parallelize it here, so
         we'd very likely be running into a performance problem if we were to
         execute this unparallelized, thus forward the whole thing to
-        "parloops".  */
-      *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+        Graphite/"parloops".  */
+      // TODO-kernels Is Graphite already able to handle this?
+      // Is this covered by tests?
+      if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+       *region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE;
+      else if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS)
+       *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+      else
+       gcc_unreachable ();
       /* Terminate: final decision for this region.  */
       *handled_ops_p = true;
       return integer_zero_node;
@@ -327,6 +339,13 @@  make_region_seq (location_t loc, gimple_seq stmts,
         loops nested inside this sequentially executed statement.  */
       make_loops_gang_single (stmts);
     }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+                        "beginning \"Graphite\" region in OpenACC"
+                        " 'kernels' construct\n");
+    }
   else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
     {
       if (dump_enabled_p ())
@@ -492,6 +511,11 @@  adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *,
            = build_omp_clause (gimple_location (stmt), OMP_CLAUSE_AUTO);
          OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
          gimple_omp_for_set_clauses (stmt, auto_clause);
+
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           dump_printf_loc (MSG_NOTE,
+                            stmt,
+                            "Added 'auto' clause to loop.\n");
        }
     }

@@ -580,6 +604,11 @@  transform_kernels_loop_clauses (gimple *omp_for,
                                           OMP_CLAUSE_AUTO);
       OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
       loop_clauses = auto_clause;
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+       dump_printf_loc (MSG_NOTE,
+                        omp_for,
+                        "Added 'auto' clause to loop.\n");
     }
   gimple_omp_for_set_clauses (omp_for, loop_clauses);
   /* We must also recurse into the loop; it might contain nested loops
@@ -661,6 +690,19 @@  make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
                         "parallelized loop nest in OpenACC 'kernels'"
                         " construct\n");

+      clauses = transform_kernels_loop_clauses (omp_for,
+                                               num_gangs_clause,
+                                               num_workers_clause,
+                                               vector_length_clause,
+                                               clauses);
+    }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+                        "forwarded loop nest in OpenACC 'kernels' construct"
+                        " to \"Graphite\" for analysis\n");
+
       clauses = transform_kernels_loop_clauses (omp_for,
                                                num_gangs_clause,
                                                num_workers_clause,
@@ -1644,8 +1686,13 @@  public:
   /* opt_pass methods: */
   virtual bool gate (function *)
   {
-    return (flag_openacc
-           && flag_openacc_kernels == OPENACC_KERNELS_SPLIT);
+    if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+       || flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS)
+      return flag_openacc;
+    else if (flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+      return false;
+    else
+      gcc_unreachable ();
   }
   virtual unsigned int execute (function *)
   {
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 3e81e878cf9..77d31c6eba8 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -1863,8 +1863,8 @@  default_goacc_reduction (gcall *call)
              gimple_seq_add_stmt (&seq, gimple_build_assign (t, expr));
              ref_to_res = t;
            }
-         tree dst = build_simple_mem_ref (ref_to_res);
          tree src = var;
+         tree dst = ref_to_res;

          if (code == IFN_GOACC_REDUCTION_SETUP)
            {
@@ -1872,6 +1872,14 @@  default_goacc_reduction (gcall *call)
              dst = lhs;
              lhs = NULL;
            }
+
+         if (TREE_TYPE (TREE_TYPE (dst)) == TREE_TYPE (src))
+             dst = build_simple_mem_ref (dst);
+
+         if (TREE_TYPE (TREE_TYPE (src)) == TREE_TYPE (dst))
+             src = build_simple_mem_ref (src);
+
+
          gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
        }
     }
@@ -2030,11 +2038,22 @@  execute_oacc_loop_designation ()
   bool is_oacc_kernels
     = (lookup_attribute ("oacc kernels",
                         DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  if (is_oacc_kernels)
+    gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                        || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+                        || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
   bool is_oacc_kernels_parallelized
     = (lookup_attribute ("oacc kernels parallelized",
                         DECL_ATTRIBUTES (current_function_decl)) != NULL);
   if (is_oacc_kernels_parallelized)
-    gcc_checking_assert (is_oacc_kernels);
+    {
+      gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                          || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+                          || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+      gcc_checking_assert (is_oacc_kernels);
+    }
   bool is_oacc_parallel_kernels_parallelized
     = (lookup_attribute ("oacc parallel_kernels_parallelized",
                         DECL_ATTRIBUTES (current_function_decl)) != NULL);
@@ -2047,6 +2066,12 @@  execute_oacc_loop_designation ()
     gcc_checking_assert (!is_oacc_kernels);
   gcc_checking_assert (!(is_oacc_parallel_kernels_parallelized
                         && is_oacc_parallel_kernels_gang_single));
+  bool is_oacc_parallel_kernels_graphite
+    = (lookup_attribute ("oacc parallel_kernels_graphite",
+                        DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  if (is_oacc_parallel_kernels_graphite)
+      gcc_checking_assert (!is_oacc_kernels
+                          && !is_oacc_parallel_kernels_gang_single);

   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
      kernels, so remove the parallelism dimensions function attributes
@@ -2076,6 +2101,9 @@  execute_oacc_loop_designation ()
       else if (is_oacc_parallel_kernels_gang_single)
        fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
                 "parallel_kernels_gang_single");
+      else if (is_oacc_parallel_kernels_graphite)
+       fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
+                "parallel_kernels_graphite");
       else
        fprintf (dump_file, "Function is OpenACC parallel offload\n");
     }
@@ -2088,11 +2116,25 @@  execute_oacc_loop_designation ()
      generic oacc_loop infrastructure and attribute/dimension processing.  */
   if (is_oacc_kernels && is_oacc_kernels_parallelized)
     {
+      //TODO-kernels We should not really end up here with KERNELS_SPLIT!?
+      gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                          || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+                          || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
       /* Parallelized OpenACC kernels constructs use gang parallelism.  See
         also tree-parloops.c:create_parallel_loop.  */
       used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
     }

+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, " [execute_oacc_loop_designation]: (attr = ");
+      print_generic_expr (dump_file, attr);
+      fprintf (dump_file, ")\n");
+    }
+
+
+
   int dims[GOMP_DIM_MAX];
   oacc_validate_dims (current_function_decl, attr, dims, fn_level, used_mask);

@@ -2118,6 +2160,10 @@  execute_oacc_loop_designation ()
         generic oacc_loop infrastructure.  */
       if (is_oacc_kernels)
        {
+         //TODO-kernels: how to handle KERNELS_SPLIT
+         /* gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS */
+         /*                   || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS); */
+
          /* Create a fake oacc_loop for diagnostic purposes.  */
          l = new_oacc_loop_raw (NULL,
                                 DECL_SOURCE_LOCATION (current_function_decl));
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion-parloops.c b/gcc/testsuite/c-c++-common/goacc/kernels-conversion-parloops.c
new file mode 100644
index 00000000000..a1616144549
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion-parloops.c
@@ -0,0 +1,61 @@ 
+/* { dg-additional-options "-fopenacc-kernels=split-parloops -fdump-tree-convert_oacc_kernels" } */
+
+#define N 1024
+
+unsigned int a[N];
+
+int
+main (void)
+{
+  int i;
+  unsigned int sum = 1;
+
+#pragma acc kernels copyin(a[0:N]) copy(sum)
+  {
+    /* converted to "oacc_kernels" */
+    #pragma acc loop
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+
+    /* converted to "oacc_parallel_kernels_gang_single" */
+    sum++;
+    a[0]++;
+
+    /* converted to "oacc_parallel_kernels_parallelized" */
+    #pragma acc loop independent
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+
+    /* converted to "oacc_kernels" */
+    if (sum > 10)
+      {
+        #pragma acc loop
+        for (i = 0; i < N; ++i)
+          sum += a[i];
+      }
+
+    /* converted to "oacc_kernels" */
+    #pragma acc loop auto
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+  }
+
+  return 0;
+}
+
+/* Check that the kernels region is split into a data region and enclosed
+   parallel regions.  */
+/* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "convert_oacc_kernels" } } */
+
+/* As noted in the comments above, we get one gang-single serial region; one
+   parallelized loop region; and three "old-style" kernel regions. */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels " 3 "convert_oacc_kernels" } } */
+
+/* Each of the parallel regions is async, and there is a final call to
+   __builtin_GOACC_wait.  */
+/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 3 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single async\\(-1\\)" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized async\\(-1\\)" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
index 8cb63f00444..83f658b7755 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -12,7 +12,7 @@  main (void)

 #pragma acc kernels copyin(a[0:N]) copy(sum)
   {
-    /* converted to "oacc_kernels" */
+    /* converted to "oacc_parallel_kernels_graphite" */
     #pragma acc loop
     for (i = 0; i < N; ++i)
       sum += a[i];
@@ -26,7 +26,7 @@  main (void)
     for (i = 0; i < N; ++i)
       sum += a[i];

-    /* converted to "oacc_kernels" */
+    /* converted to "oacc_parallel_kernels_graphite" */
     if (sum > 10)
       {
         #pragma acc loop
@@ -34,7 +34,7 @@  main (void)
           sum += a[i];
       }

-    /* converted to "oacc_kernels" */
+    /* converted to "oacc_parallel_kernels_graphite" */
     #pragma acc loop auto
     for (i = 0; i < N; ++i)
       sum += a[i];
@@ -48,14 +48,14 @@  main (void)
 /* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "convert_oacc_kernels" } } */

 /* As noted in the comments above, we get one gang-single serial region; one
-   parallelized loop region; and three "old-style" kernel regions. */
+   parallelized loop region; and three "graphite" kernel regions. */
 /* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 "convert_oacc_kernels" } } */
 /* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 "convert_oacc_kernels" } } */
-/* { dg-final { scan-tree-dump-times "oacc_kernels " 3 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite " 3 "convert_oacc_kernels" } } */

 /* Each of the parallel regions is async, and there is a final call to
    __builtin_GOACC_wait.  */
-/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 3 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite async\\(-1\\)" 3 "convert_oacc_kernels" } } */
 /* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single async\\(-1\\)" 1 "convert_oacc_kernels" } } */
 /* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized async\\(-1\\)" 1 "convert_oacc_kernels" } } */
 /* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90 b/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90
new file mode 100644
index 00000000000..49ff69291ed
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90
@@ -0,0 +1,37 @@ 
+! { dg-additional-options "-fopenacc-kernels=split" }
+
+! A regression test checking that the reduction clause lowering does
+! not fail if a subroutine argument is used as a reduction variable in
+! a kernels region.
+
+! This was fine ...
+subroutine reduction_var_not_argument(res)
+  real res
+  real tmp
+  integer i
+
+  !$acc kernels
+  !$acc loop reduction(+:tmp)
+  do i=0,n-1
+     tmp = tmp + 1
+  end do
+  !$acc end kernels
+
+  res = tmp
+end subroutine reduction_var_not_argument
+
+! ... but this led to problems because ARG
+! was a pointer type that did not get dereferenced.
+subroutine reduction_var_as_argument(arg)
+  real arg
+  integer i
+
+  !$acc kernels
+  !$acc loop reduction(+:arg)
+  do i=0,n-1
+     arg = arg + 1
+  end do
+  !$acc end kernels
+end subroutine reduction_var_as_argument
+
+
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index d9250d36c72..683a188502f 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -4174,7 +4174,15 @@  public:
   virtual bool gate (function *)
   {
     if (oacc_kernels_p)
-      return flag_openacc;
+      {
+       if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+         return false;
+
+       gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                            || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+       return flag_openacc;
+      }
     else
       return flag_tree_parallelize_loops > 1;
   }
@@ -4193,6 +4201,12 @@  public:
 unsigned
 pass_parallelize_loops::execute (function *fun)
 {
+  if (oacc_kernels_p)
+    {
+      gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                          || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+    }
+
   tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
   if (nthreads == NULL_TREE)
     return 0;
diff --git a/gcc/tree-ssa-loop.c b/gcc/tree-ssa-loop.c
index ed06527aa3f..7697eb19c8b 100644
--- a/gcc/tree-ssa-loop.c
+++ b/gcc/tree-ssa-loop.c
@@ -155,6 +155,12 @@  make_pass_tree_loop (gcc::context *ctxt)
 static bool
 gate_oacc_kernels (function *fn)
 {
+  if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+    return false;
+
+  gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+                      || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
   if (!flag_openacc)
     return false;

@@ -324,6 +330,10 @@  public:
   /* opt_pass methods: */
   virtual bool gate (function *)
   {
+    if (flag_openacc_kernels != OPENACC_KERNELS_SPLIT_PARLOOPS
+       && flag_openacc_kernels != OPENACC_KERNELS_PARLOOPS)
+      return false;
+
     return (optimize
            && flag_openacc
            /* Don't bother doing anything if the program has errors.  */