From 186a81186e76c67ce4442d1535d12c3f477583c5 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 24 Oct 2025 15:53:30 +0000 Subject: [PATCH 01/24] Extend handler-less kernel submission path to API functions with event dependencies --- .../oneapi/experimental/enqueue_functions.hpp | 4 +- .../sycl/khr/free_function_commands.hpp | 8 +- sycl/include/sycl/queue.hpp | 134 +++++++++++++----- sycl/source/detail/event_impl.cpp | 72 ++++++++++ sycl/source/detail/event_impl.hpp | 10 ++ sycl/source/detail/queue_impl.cpp | 24 +++- sycl/source/detail/queue_impl.hpp | 13 +- sycl/source/handler.cpp | 66 +-------- sycl/source/queue.cpp | 20 +-- 9 files changed, 226 insertions(+), 125 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 8c8488a99e354..4650926be5ca3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -160,7 +160,7 @@ void single_task(queue Q, const KernelType &KernelObj, !(detail::KernelLambdaHasKernelHandlerArgT::value)) { detail::submit_kernel_direct_single_task( - std::move(Q), empty_properties_t{}, KernelObj, CodeLoc); + std::move(Q), empty_properties_t{}, KernelObj, {}, CodeLoc); } else { submit( std::move(Q), @@ -281,7 +281,7 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { detail::submit_kernel_direct_parallel_for( - std::move(Q), empty_properties_t{}, Range, KernelObj); + std::move(Q), empty_properties_t{}, Range, KernelObj, {}); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 68dd159bf8211..e17adb0ffca23 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -166,7 +166,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k, KernelType, sycl::nd_item<1>>::value)) { detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<1>(r, size), std::forward(k)); + nd_range<1>(r, size), std::forward(k), {}); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -187,7 +187,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k, KernelType, sycl::nd_item<2>>::value)) { detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<2>(r, size), std::forward(k)); + nd_range<2>(r, size), std::forward(k), {}); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -208,7 +208,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k, KernelType, sycl::nd_item<3>>::value)) { detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<3>(r, size), std::forward(k)); + nd_range<3>(r, size), std::forward(k), {}); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -333,7 +333,7 @@ void launch_task(const sycl::queue &q, KernelType &&k, void>::value)) { detail::submit_kernel_direct_single_task( q, ext::oneapi::experimental::empty_properties_t{}, - std::forward(k), codeLoc); + std::forward(k), {}, codeLoc); } else { submit(q, [&](handler &h) { launch_task(h, k); }, codeLoc); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 4a7f1fac789a3..0cf0732a8fb95 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -40,6 +40,7 @@ #include // for nd_range #include // for property_list #include // for range +#include // for sycl::span #include // for size_t #include // for function @@ -67,14 +68,14 @@ template event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -163,6 +164,7 @@ template &Range, KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()) { // TODO Properties not supported yet static_assert( @@ -212,11 +214,11 @@ auto submit_kernel_direct( if constexpr (EventNeeded) { return submit_kernel_direct_with_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } else { submit_kernel_direct_without_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } } @@ -225,7 +227,7 @@ template auto submit_kernel_direct_parallel_for( const queue &Queue, PropertiesT Props, const nd_range &Range, - KernelTypeUniversalRef &&KernelFunc, + KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()) { using KernelType = @@ -247,20 +249,21 @@ auto submit_kernel_direct_parallel_for( KernelName, EventNeeded, PropertiesT, KernelTypeUniversalRef, Dims>( Queue, Props, Range, std::forward(KernelFunc), - CodeLoc); + DepEvents, CodeLoc); } template auto submit_kernel_direct_single_task( const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()) { return submit_kernel_direct( Queue, Props, nd_range<1>{1, 1}, - std::forward(KernelFunc), CodeLoc); + std::forward(KernelFunc), DepEvents, CodeLoc); } } // namespace detail @@ -2786,7 +2789,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { !(detail::KernelLambdaHasKernelHandlerArgT::value)) { return detail::submit_kernel_direct_single_task( *this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc, - TlsCodeLocCapture.query()); + {}, TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { @@ -2836,13 +2839,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template single_task( - Properties, KernelFunc); - }, - TlsCodeLocCapture.query()); + + // TODO The handler-less path does not support kernel + // function properties and kernel functions with the kernel_handler + // type argument yet. + if constexpr ( + std::is_same_v && + !(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + std::vector DepEvents = {DepEvent}; + return detail::submit_kernel_direct_single_task( + *this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc, + DepEvents, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template single_task( + Properties, KernelFunc); + }, + TlsCodeLocCapture.query()); + } } /// single_task version with a kernel represented as a lambda. @@ -2887,13 +2906,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template single_task( - Properties, KernelFunc); - }, - TlsCodeLocCapture.query()); + + if constexpr ( + std::is_same_v && + !(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + return detail::submit_kernel_direct_single_task( + *this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc, + DepEvents, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template single_task( + Properties, KernelFunc); + }, + TlsCodeLocCapture.query()); + } } /// single_task version with a kernel represented as a lambda. @@ -3355,7 +3386,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( *this, ext::oneapi::experimental::empty_properties_t{}, Range, - Rest..., TlsCodeLocCapture.query()); + Rest..., {}, TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { @@ -3407,12 +3438,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, event DepEvent, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template parallel_for(Range, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions, kernel + // function properties and kernel functions with the kernel_handler + // type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + std::vector DepEvents = {DepEvent}; + return detail::submit_kernel_direct_parallel_for( + *this, ext::oneapi::experimental::empty_properties_t{}, Range, + Rest..., DepEvents, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3461,12 +3509,28 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template parallel_for(Range, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions, kernel + // function properties and kernel functions with the kernel_handler + // type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, ext::oneapi::experimental::empty_properties_t{}, Range, + Rest..., DepEvents, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// Copies data from a memory region pointed to by a placeholder accessor to diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index b0c838cdd890c..40f13f1f863e1 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -611,6 +611,78 @@ bool event_impl::isCompleted() { void event_impl::setCommand(Command *Cmd) { MCommand = Cmd; } +void registerEventDependency( + const detail::EventImplPtr &EventImpl, + std::vector &EventsRegistered, + detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, + const detail::device_impl &DeviceImpl, + const std::shared_ptr &Graph, + sycl::detail::CGType CommandGroupType) { + + if (!EventImpl) + return; + if (EventImpl->isDiscarded()) { + throw sycl::exception(make_error_code(errc::invalid), + "Queue operation cannot depend on discarded event."); + } + + // Async alloc calls adapter immediately. Any explicit/implicit dependencies + // are handled at that point, including in order queue deps. Further calls to + // depends_on after an async alloc are explicitly disallowed. + if (CommandGroupType == CGType::AsyncAlloc) { + throw sycl::exception(make_error_code(errc::invalid), + "Cannot submit a dependency after an asynchronous " + "allocation has already been executed!"); + } + + auto EventGraph = EventImpl->getCommandGraph(); + if (QueueImpl && EventGraph) { + auto QueueGraph = QueueImpl->getCommandGraph(); + + if (&EventGraph->getContextImpl() != &ContextImpl) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot submit to a queue with a dependency from a graph that is " + "associated with a different context."); + } + + if (&EventGraph->getDeviceImpl() != &DeviceImpl) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot submit to a queue with a dependency from a graph that is " + "associated with a different device."); + } + + if (QueueGraph && QueueGraph != EventGraph) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Cannot submit to a recording queue with a " + "dependency from a different graph."); + } + + // If the event dependency has a graph, that means that the queue that + // created it was in recording mode. If the current queue is not recording, + // we need to set it to recording (implements the transitive queue recording + // feature). + if (!QueueGraph) { + EventGraph->beginRecording(*QueueImpl); + } + } + + if (Graph) { + if (EventGraph == nullptr) { + throw sycl::exception( + make_error_code(errc::invalid), + "Graph nodes cannot depend on events from outside the graph."); + } + if (EventGraph != Graph) { + throw sycl::exception( + make_error_code(errc::invalid), + "Graph nodes cannot depend on events from another graph."); + } + } + EventsRegistered.push_back(EventImpl); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 0ca4aa6d49a9f..54e52ba366758 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -473,6 +474,15 @@ class events_range : public iterator_range { public: using Base::Base; }; + +void registerEventDependency( + const detail::EventImplPtr &EventToRegister, + std::vector &EventsRegistered, + detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, + const detail::device_impl &DeviceImpl, + const std::shared_ptr &Graph, + sycl::detail::CGType CommandGroupType); + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 4ed73e700d8ce..127ac83136c21 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,7 +567,8 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { KernelData KData; @@ -619,12 +620,12 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( *this, true); }; - return submit_direct(CallerNeedsEvent, SubmitKernelFunc); + return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc); } template detail::EventImplPtr -queue_impl::submit_direct(bool CallerNeedsEvent, +queue_impl::submit_direct(bool CallerNeedsEvent, sycl::span DepEvents, SubmitCommandFuncType &SubmitCommandFunc) { detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); @@ -635,13 +636,26 @@ queue_impl::submit_direct(bool CallerNeedsEvent, // Sync with an external event std::optional ExternalEvent = popExternalEvent(); if (ExternalEvent) { - CGData.MEvents.push_back(getSyclObjImpl(*ExternalEvent)); + registerEventDependency(getSyclObjImpl(*ExternalEvent), CGData.MEvents, + this, getContextImpl(), getDeviceImpl(), + hasCommandGraph() ? getCommandGraph() : nullptr, + detail::CGType::Kernel); } // Sync with the last event for in order queue EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; if (isInOrder() && LastEvent) { - CGData.MEvents.push_back(LastEvent); + registerEventDependency(LastEvent, CGData.MEvents, this, getContextImpl(), + getDeviceImpl(), + hasCommandGraph() ? getCommandGraph() : nullptr, + detail::CGType::Kernel); + } + + for (event e : DepEvents) { + registerEventDependency(getSyclObjImpl(e), CGData.MEvents, this, + getContextImpl(), getDeviceImpl(), + hasCommandGraph() ? getCommandGraph() : nullptr, + detail::CGType::Kernel); } // Barrier and un-enqueued commands synchronization for out or order queue diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 49da7aee8c448..ed660af9fe74a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -362,21 +362,21 @@ class queue_impl : public std::enable_shared_from_this { template event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - true, CodeLoc, IsTopCodeLoc); + true, DepEvents, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } template void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, CodeLoc, IsTopCodeLoc); + false, DepEvents, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -929,10 +929,11 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); template - EventImplPtr submit_direct(bool CallerNeedsEvent, + EventImplPtr submit_direct(bool CallerNeedsEvent, sycl::span DepEvents, SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 26477c99be62c..146695a1d2681 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1671,69 +1671,9 @@ void handler::depends_on(const std::vector &Events) { } void handler::depends_on(const detail::EventImplPtr &EventImpl) { - if (!EventImpl) - return; - if (EventImpl->isDiscarded()) { - throw sycl::exception(make_error_code(errc::invalid), - "Queue operation cannot depend on discarded event."); - } - - // Async alloc calls adapter immediately. Any explicit/implicit dependencies - // are handled at that point, including in order queue deps. Further calls to - // depends_on after an async alloc are explicitly disallowed. - if (getType() == CGType::AsyncAlloc) { - throw sycl::exception(make_error_code(errc::invalid), - "Cannot submit a dependency after an asynchronous " - "allocation has already been executed!"); - } - - auto EventGraph = EventImpl->getCommandGraph(); - queue_impl *Queue = impl->get_queue_or_null(); - if (Queue && EventGraph) { - auto QueueGraph = Queue->getCommandGraph(); - - if (&EventGraph->getContextImpl() != &impl->get_context()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Cannot submit to a queue with a dependency from a graph that is " - "associated with a different context."); - } - - if (&EventGraph->getDeviceImpl() != &impl->get_device()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Cannot submit to a queue with a dependency from a graph that is " - "associated with a different device."); - } - - if (QueueGraph && QueueGraph != EventGraph) { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Cannot submit to a recording queue with a " - "dependency from a different graph."); - } - - // If the event dependency has a graph, that means that the queue that - // created it was in recording mode. If the current queue is not recording, - // we need to set it to recording (implements the transitive queue recording - // feature). - if (!QueueGraph) { - EventGraph->beginRecording(*Queue); - } - } - - if (auto Graph = getCommandGraph(); Graph) { - if (EventGraph == nullptr) { - throw sycl::exception( - make_error_code(errc::invalid), - "Graph nodes cannot depend on events from outside the graph."); - } - if (EventGraph != Graph) { - throw sycl::exception( - make_error_code(errc::invalid), - "Graph nodes cannot depend on events from another graph."); - } - } - impl->CGData.MEvents.push_back(EventImpl); + registerEventDependency(EventImpl, impl->CGData.MEvents, + impl->get_queue_or_null(), impl->get_context(), + impl->get_device(), getCommandGraph(), getType()); } void handler::depends_on(const std::vector &Events) { diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index f34da47852266..6001bc3a49f38 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -475,56 +475,56 @@ template event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, DepEvents, CodeLoc, IsTopCodeLoc); } template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, DepEvents, CodeLoc, IsTopCodeLoc); } template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, + detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1 From b769e4992b2f4447f9fdb96e8b969f38476b556b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 27 Oct 2025 14:31:40 +0000 Subject: [PATCH 02/24] Add an unlocked version of graph_impl::beginRecording --- sycl/source/detail/event_impl.cpp | 23 ++++++++++++++++++++++- sycl/source/detail/event_impl.hpp | 1 + sycl/source/detail/graph/graph_impl.cpp | 8 ++++++++ sycl/source/detail/graph/graph_impl.hpp | 8 ++++++++ sycl/source/detail/queue_impl.cpp | 24 ++++++++++++------------ sycl/source/detail/queue_impl.hpp | 9 +++++++-- 6 files changed, 58 insertions(+), 15 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 40f13f1f863e1..1c14ed43d3875 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -611,6 +611,7 @@ bool event_impl::isCompleted() { void event_impl::setCommand(Command *Cmd) { MCommand = Cmd; } +template void registerEventDependency( const detail::EventImplPtr &EventImpl, std::vector &EventsRegistered, @@ -664,7 +665,11 @@ void registerEventDependency( // we need to set it to recording (implements the transitive queue recording // feature). if (!QueueGraph) { - EventGraph->beginRecording(*QueueImpl); + if constexpr (LockQueue) { + EventGraph->beginRecording(*QueueImpl); + } else { + EventGraph->beginRecordingUnlockedQueue(*QueueImpl); + } } } @@ -683,6 +688,22 @@ void registerEventDependency( EventsRegistered.push_back(EventImpl); } +template void registerEventDependency( + const detail::EventImplPtr &EventImpl, + std::vector &EventsRegistered, + detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, + const detail::device_impl &DeviceImpl, + const std::shared_ptr &Graph, + sycl::detail::CGType CommandGroupType); + +template void registerEventDependency( + const detail::EventImplPtr &EventImpl, + std::vector &EventsRegistered, + detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, + const detail::device_impl &DeviceImpl, + const std::shared_ptr &Graph, + sycl::detail::CGType CommandGroupType); + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 54e52ba366758..0e4a1e62d904a 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -475,6 +475,7 @@ class events_range : public iterator_range { using Base::Base; }; +template void registerEventDependency( const detail::EventImplPtr &EventToRegister, std::vector &EventsRegistered, diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index ad5058361b6d4..9f878f0e8ea66 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -690,6 +690,14 @@ std::vector graph_impl::getExitNodesEvents( return Events; } +void graph_impl::beginRecordingUnlockedQueue(sycl::detail::queue_impl &Queue) { + graph_impl::WriteLock Lock(MMutex); + if (!Queue.hasCommandGraph()) { + Queue.setCommandGraphUnlocked(shared_from_this()); + addQueue(Queue); + } +} + void graph_impl::beginRecording(sycl::detail::queue_impl &Queue) { graph_impl::WriteLock Lock(MMutex); if (!Queue.hasCommandGraph()) { diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp index 04f4bef37eadf..185f75ca82c1b 100644 --- a/sycl/source/detail/graph/graph_impl.hpp +++ b/sycl/source/detail/graph/graph_impl.hpp @@ -461,6 +461,14 @@ class graph_impl : public std::enable_shared_from_this { std::vector getExitNodesEvents(std::weak_ptr Queue); + /// Sets the Queue state to queue_state::recording. Adds the queue to the list + /// of recording queues associated with this graph. + /// Does not take the queue submission lock. Required for the cases, + /// when the recording is started direct from within the kernel submission + /// flow. + /// @param[in] Queue The queue to be recorded from. + void beginRecordingUnlockedQueue(sycl::detail::queue_impl &Queue); + /// Sets the Queue state to queue_state::recording. Adds the queue to the list /// of recording queues associated with this graph. /// @param[in] Queue The queue to be recorded from. diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 76c6d0ea91b0a..f4529104dd356 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -636,10 +636,10 @@ queue_impl::submit_direct(bool CallerNeedsEvent, sycl::span DepEvents, // Sync with an external event std::optional ExternalEvent = popExternalEvent(); if (ExternalEvent) { - registerEventDependency(getSyclObjImpl(*ExternalEvent), CGData.MEvents, - this, getContextImpl(), getDeviceImpl(), - hasCommandGraph() ? getCommandGraph() : nullptr, - detail::CGType::Kernel); + registerEventDependency( + getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(), + getDeviceImpl(), hasCommandGraph() ? getCommandGraph() : nullptr, + detail::CGType::Kernel); } auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps; @@ -647,17 +647,17 @@ queue_impl::submit_direct(bool CallerNeedsEvent, sycl::span DepEvents, // Sync with the last event for in order queue EventImplPtr &LastEvent = Deps.LastEventPtr; if (isInOrder() && LastEvent) { - registerEventDependency(LastEvent, CGData.MEvents, this, getContextImpl(), - getDeviceImpl(), - hasCommandGraph() ? getCommandGraph() : nullptr, - detail::CGType::Kernel); + registerEventDependency( + LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(), + hasCommandGraph() ? getCommandGraph() : nullptr, + detail::CGType::Kernel); } for (event e : DepEvents) { - registerEventDependency(getSyclObjImpl(e), CGData.MEvents, this, - getContextImpl(), getDeviceImpl(), - hasCommandGraph() ? getCommandGraph() : nullptr, - detail::CGType::Kernel); + registerEventDependency( + getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(), + getDeviceImpl(), hasCommandGraph() ? getCommandGraph() : nullptr, + detail::CGType::Kernel); } // Barrier and un-enqueued commands synchronization for out or order queue diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index ed660af9fe74a..3db16d225a1e3 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -622,9 +622,8 @@ class queue_impl : public std::enable_shared_from_this { const std::vector &DepEvents, bool CallerNeedsEvent); - void setCommandGraph( + void setCommandGraphUnlocked( std::shared_ptr Graph) { - std::lock_guard Lock(MMutex); MGraph = Graph; MExtGraphDeps.reset(); @@ -635,6 +634,12 @@ class queue_impl : public std::enable_shared_from_this { } } + void setCommandGraph( + std::shared_ptr Graph) { + std::lock_guard Lock(MMutex); + setCommandGraphUnlocked(Graph); + } + std::shared_ptr getCommandGraph() const { return MGraph.lock(); From 4c82669cea9ef8cc933b388e29426d7533d46fce Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 27 Oct 2025 15:16:33 +0000 Subject: [PATCH 03/24] Use the const event type in sycl::span --- sycl/include/sycl/queue.hpp | 16 +++++++------ sycl/source/detail/queue_impl.cpp | 5 ++-- sycl/source/detail/queue_impl.hpp | 15 +++++++----- sycl/source/queue.cpp | 40 ++++++++++++++++++------------- 4 files changed, 45 insertions(+), 31 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 0cf0732a8fb95..6a5e429c1f342 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -68,15 +68,17 @@ template event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); namespace detail { class queue_impl; @@ -164,7 +166,7 @@ template &Range, KernelTypeUniversalRef &&KernelFunc, - sycl::span DepEvents, + sycl::span DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()) { // TODO Properties not supported yet static_assert( @@ -227,7 +229,7 @@ template auto submit_kernel_direct_parallel_for( const queue &Queue, PropertiesT Props, const nd_range &Range, - KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, + KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()) { using KernelType = @@ -256,7 +258,7 @@ template auto submit_kernel_direct_single_task( const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc, - sycl::span DepEvents, + sycl::span DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()) { return submit_kernel_direct DepEvents, const detail::code_location &CodeLoc, + sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; @@ -625,7 +625,8 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( template detail::EventImplPtr -queue_impl::submit_direct(bool CallerNeedsEvent, sycl::span DepEvents, +queue_impl::submit_direct(bool CallerNeedsEvent, + sycl::span DepEvents, SubmitCommandFuncType &SubmitCommandFunc) { detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 3db16d225a1e3..6feee26964513 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -362,8 +362,9 @@ class queue_impl : public std::enable_shared_from_this { template event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, true, DepEvents, CodeLoc, IsTopCodeLoc); @@ -373,8 +374,9 @@ class queue_impl : public std::enable_shared_from_this { template void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, false, DepEvents, CodeLoc, IsTopCodeLoc); } @@ -934,11 +936,12 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - sycl::span DepEvents, const detail::code_location &CodeLoc, + sycl::span DepEvents, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template - EventImplPtr submit_direct(bool CallerNeedsEvent, sycl::span DepEvents, + EventImplPtr submit_direct(bool CallerNeedsEvent, + sycl::span DepEvents, SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 6001bc3a49f38..2424d8c869b82 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -475,8 +475,9 @@ template event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( Range, HostKernel, DeviceKernelInfo, DepEvents, CodeLoc, IsTopCodeLoc); } @@ -484,27 +485,31 @@ event submit_kernel_direct_with_event_impl( template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); template void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( Range, HostKernel, DeviceKernelInfo, DepEvents, CodeLoc, IsTopCodeLoc); } @@ -512,20 +517,23 @@ void submit_kernel_direct_without_event_impl( template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); + detail::DeviceKernelInfo *DeviceKernelInfo, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); } // namespace _V1 } // namespace sycl From 8c36583ed7e01ce6718df8670a49ae4336689034 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 28 Oct 2025 11:30:56 +0000 Subject: [PATCH 04/24] Update Linux symbols, update tests --- sycl/test/abi/sycl_symbols_linux.dump | 12 ++++++------ sycl/test/basic_tests/single_task_error_message.cpp | 6 ++++++ sycl/test/include_deps/sycl_detail_core.hpp.cpp | 1 + .../include_deps/sycl_khr_includes_queue.hpp.cpp | 1 + sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp | 1 + 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 032b82ae74293..6f04667e3d0bc 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,12 +2985,12 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_13code_locationEb _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE _ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv diff --git a/sycl/test/basic_tests/single_task_error_message.cpp b/sycl/test/basic_tests/single_task_error_message.cpp index b086c81e4f2af..66fc1c39b1415 100644 --- a/sycl/test/basic_tests/single_task_error_message.cpp +++ b/sycl/test/basic_tests/single_task_error_message.cpp @@ -31,6 +31,9 @@ int main() { [&](sycl::handler &cgh) { // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}} // expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}} + // TODO Investigate why this function template is not instantiated + // (if this is expected). + // expected-error@sycl/detail/cg_types.hpp:* {{no matching function for call to 'runKernelWithoutArg'}} }) .wait(); } @@ -47,6 +50,9 @@ int main() { [&](sycl::handler &cgh) { // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}} // expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}} + // TODO Investigate why this function template is not instantiated + // (if this is expected). + // expected-error@sycl/detail/cg_types.hpp:* {{no matching function for call to 'runKernelWithoutArg'}} }) .wait(); } diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index cf98e8708254a..7126e6109879b 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -154,5 +154,6 @@ // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: feature_test.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp index a37e63c0cc7de..4896332c3aa4a 100644 --- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp @@ -158,4 +158,5 @@ // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: sycl_span.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index 87de587378f40..d53dc0e82792f 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -173,6 +173,7 @@ // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp // CHECK-NEXT: usm/usm_allocator.hpp // CHECK-EMPTY: From 256de780a65c8462772c18ba77459c7e52f6ddc8 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 28 Oct 2025 11:49:45 +0000 Subject: [PATCH 05/24] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index d5f53a5bbb505..6accfd60df3eb 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -286,12 +286,12 @@ ??$is_image_handle_supported@Usampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVdevice@34@AEBVcontext@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z -??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBUcode_location@501@_N@Z ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z From 0187376e344eb41a4dc5c032e2bd1004c45c4e84 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 28 Oct 2025 11:50:36 +0000 Subject: [PATCH 06/24] Fix formatting --- sycl/test/basic_tests/single_task_error_message.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/test/basic_tests/single_task_error_message.cpp b/sycl/test/basic_tests/single_task_error_message.cpp index 66fc1c39b1415..7c6f020741247 100644 --- a/sycl/test/basic_tests/single_task_error_message.cpp +++ b/sycl/test/basic_tests/single_task_error_message.cpp @@ -31,8 +31,8 @@ int main() { [&](sycl::handler &cgh) { // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}} // expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}} - // TODO Investigate why this function template is not instantiated - // (if this is expected). + // TODO Investigate why this function template is not + // instantiated (if this is expected). // expected-error@sycl/detail/cg_types.hpp:* {{no matching function for call to 'runKernelWithoutArg'}} }) .wait(); @@ -50,8 +50,8 @@ int main() { [&](sycl::handler &cgh) { // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}} // expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}} - // TODO Investigate why this function template is not instantiated - // (if this is expected). + // TODO Investigate why this function template is not + // instantiated (if this is expected). // expected-error@sycl/detail/cg_types.hpp:* {{no matching function for call to 'runKernelWithoutArg'}} }) .wait(); From 7cd45e7f9d54ec5ce5f9fbdd3235c7d743c89756 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 29 Oct 2025 14:49:33 +0000 Subject: [PATCH 07/24] Fix comment --- sycl/source/detail/graph/graph_impl.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp index 185f75ca82c1b..10cbbfab0282c 100644 --- a/sycl/source/detail/graph/graph_impl.hpp +++ b/sycl/source/detail/graph/graph_impl.hpp @@ -463,9 +463,10 @@ class graph_impl : public std::enable_shared_from_this { /// Sets the Queue state to queue_state::recording. Adds the queue to the list /// of recording queues associated with this graph. - /// Does not take the queue submission lock. Required for the cases, - /// when the recording is started direct from within the kernel submission - /// flow. + /// Does not take the queue submission lock. + /// + /// Required for the cases, when the recording is started directly + /// from within the kernel submission flow. /// @param[in] Queue The queue to be recorded from. void beginRecordingUnlockedQueue(sycl::detail::queue_impl &Queue); From 497ad12e23470fbcd6d7903c0d55db25f3cc7bce Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 31 Oct 2025 11:43:35 +0000 Subject: [PATCH 08/24] Avoid vector allocation for a single dependency --- sycl/include/sycl/queue.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 6a5e429c1f342..1e6191cfea46d 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2851,10 +2851,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { !(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT::value)) { - std::vector DepEvents = {DepEvent}; return detail::submit_kernel_direct_single_task( *this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc, - DepEvents, TlsCodeLocCapture.query()); + sycl::span(&DepEvent, 1), TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { @@ -3441,7 +3440,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - // TODO The handler-less path does not support reductions, kernel // function properties and kernel functions with the kernel_handler // type argument yet. @@ -3451,10 +3449,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - std::vector DepEvents = {DepEvent}; return detail::submit_kernel_direct_parallel_for( *this, ext::oneapi::experimental::empty_properties_t{}, Range, - Rest..., DepEvents, TlsCodeLocCapture.query()); + Rest..., sycl::span(&DepEvent, 1), + TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { From 50568ea42f28925d2f7ac9a9794bc2802429b61e Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 08:26:19 +0000 Subject: [PATCH 09/24] Add sycl::span layout test. --- sycl/test/abi/layout_span.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 sycl/test/abi/layout_span.cpp diff --git a/sycl/test/abi/layout_span.cpp b/sycl/test/abi/layout_span.cpp new file mode 100644 index 0000000000000..d274b5ee33247 --- /dev/null +++ b/sycl/test/abi/layout_span.cpp @@ -0,0 +1,23 @@ +// RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s +// REQUIRES: linux +// UNSUPPORTED: libcxx + +// clang-format off + +#include + +void span(sycl::span) {} +// CHECK: 0 | class sycl::span +// CHECK-NEXT: 0 | pointer __data +// CHECK-NEXT: | [sizeof=8, dsize=8, align=8, +// CHECK-NEXT: | nvsize=8, nvalign=8] + +//---------------------------- + +void span_dynamic_extent(sycl::span) {} +// CHECK: 0 | class sycl::span +// CHECK-NEXT: 0 | pointer __data +// CHECK-NEXT: 8 | size_type __size +// CHECK-NEXT: | [sizeof=16, dsize=16, align=8, +// CHECK-NEXT: | nvsize=16, nvalign=8] From a66128aa6d7c1576583cbc4e13b9db04d8b11eb2 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 09:35:33 +0000 Subject: [PATCH 10/24] Add a new header with event dependency utilities. --- sycl/source/detail/event_deps.hpp | 123 ++++++++++++++++++++++++++++++ sycl/source/detail/event_impl.cpp | 93 ---------------------- sycl/source/detail/event_impl.hpp | 9 --- sycl/source/detail/queue_impl.cpp | 7 +- sycl/source/handler.cpp | 4 +- 5 files changed, 130 insertions(+), 106 deletions(-) create mode 100644 sycl/source/detail/event_deps.hpp diff --git a/sycl/source/detail/event_deps.hpp b/sycl/source/detail/event_deps.hpp new file mode 100644 index 0000000000000..cd62a86138c47 --- /dev/null +++ b/sycl/source/detail/event_deps.hpp @@ -0,0 +1,123 @@ +//==---------------- event_deps.hpp - SYCL event dependency utils ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { + +/// Adds an event dependency to the list of dependencies, performing +/// a series of checks. +/// +/// If the event is associated with a graph, and the queue is not, +/// the queue will be switched to a recording mode (transitive queue +/// recording feature). +/// +/// The LockQueue template argument defines, whether the queue lock +/// should be acquired for the transition to a recording mode. It is +/// set to false in cases, where the event dependencies are set directly +/// in the command submission flow, where the lock is already aquired. +/// +/// \param EventImpl Event to register as a dependency +/// \param EventsRegistered A list of already registered events, where +/// the event will be added. +/// \param QueueImpl A queue associated with the event dependencies. Can +/// be nullptr if no associated queue. +/// \param ContextImpl A context associated with a queue or graph. +/// \param DeviceImpl A device associated with a queue or graph. +/// \param GraphImpl A graph associated with a queue or a handler. Can +/// be nullptr if no associated graph. +/// \param CommandGroupType Type of command group. +template +void registerEventDependency( + const EventImplPtr &EventImpl, std::vector &EventsRegistered, + queue_impl *QueueImpl, const context_impl &ContextImpl, + const device_impl &DeviceImpl, + const ext::oneapi::experimental::detail::graph_impl *GraphImpl, + CGType CommandGroupType) { + + if (!EventImpl) + return; + if (EventImpl->isDiscarded()) { + throw sycl::exception(make_error_code(errc::invalid), + "Queue operation cannot depend on discarded event."); + } + + // Async alloc calls adapter immediately. Any explicit/implicit dependencies + // are handled at that point, including in order queue deps. Further calls to + // depends_on after an async alloc are explicitly disallowed. + if (CommandGroupType == CGType::AsyncAlloc) { + throw sycl::exception(make_error_code(errc::invalid), + "Cannot submit a dependency after an asynchronous " + "allocation has already been executed!"); + } + + auto EventGraph = EventImpl->getCommandGraph(); + if (QueueImpl && EventGraph) { + auto QueueGraph = QueueImpl->getCommandGraph(); + + if (&EventGraph->getContextImpl() != &ContextImpl) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot submit to a queue with a dependency from a graph that is " + "associated with a different context."); + } + + if (&EventGraph->getDeviceImpl() != &DeviceImpl) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot submit to a queue with a dependency from a graph that is " + "associated with a different device."); + } + + if (QueueGraph && QueueGraph != EventGraph) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Cannot submit to a recording queue with a " + "dependency from a different graph."); + } + + // If the event dependency has a graph, that means that the queue that + // created it was in recording mode. If the current queue is not recording, + // we need to set it to recording (implements the transitive queue recording + // feature). + if (!QueueGraph) { + if constexpr (LockQueue) { + EventGraph->beginRecording(*QueueImpl); + } else { + EventGraph->beginRecordingUnlockedQueue(*QueueImpl); + } + } + } + + if (GraphImpl) { + if (EventGraph == nullptr) { + throw sycl::exception( + make_error_code(errc::invalid), + "Graph nodes cannot depend on events from outside the graph."); + } + if (EventGraph.get() != GraphImpl) { + throw sycl::exception( + make_error_code(errc::invalid), + "Graph nodes cannot depend on events from another graph."); + } + } + EventsRegistered.push_back(EventImpl); +} + +} // namespace detail +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 1c14ed43d3875..b0c838cdd890c 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -611,99 +611,6 @@ bool event_impl::isCompleted() { void event_impl::setCommand(Command *Cmd) { MCommand = Cmd; } -template -void registerEventDependency( - const detail::EventImplPtr &EventImpl, - std::vector &EventsRegistered, - detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, - const detail::device_impl &DeviceImpl, - const std::shared_ptr &Graph, - sycl::detail::CGType CommandGroupType) { - - if (!EventImpl) - return; - if (EventImpl->isDiscarded()) { - throw sycl::exception(make_error_code(errc::invalid), - "Queue operation cannot depend on discarded event."); - } - - // Async alloc calls adapter immediately. Any explicit/implicit dependencies - // are handled at that point, including in order queue deps. Further calls to - // depends_on after an async alloc are explicitly disallowed. - if (CommandGroupType == CGType::AsyncAlloc) { - throw sycl::exception(make_error_code(errc::invalid), - "Cannot submit a dependency after an asynchronous " - "allocation has already been executed!"); - } - - auto EventGraph = EventImpl->getCommandGraph(); - if (QueueImpl && EventGraph) { - auto QueueGraph = QueueImpl->getCommandGraph(); - - if (&EventGraph->getContextImpl() != &ContextImpl) { - throw sycl::exception( - make_error_code(errc::invalid), - "Cannot submit to a queue with a dependency from a graph that is " - "associated with a different context."); - } - - if (&EventGraph->getDeviceImpl() != &DeviceImpl) { - throw sycl::exception( - make_error_code(errc::invalid), - "Cannot submit to a queue with a dependency from a graph that is " - "associated with a different device."); - } - - if (QueueGraph && QueueGraph != EventGraph) { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Cannot submit to a recording queue with a " - "dependency from a different graph."); - } - - // If the event dependency has a graph, that means that the queue that - // created it was in recording mode. If the current queue is not recording, - // we need to set it to recording (implements the transitive queue recording - // feature). - if (!QueueGraph) { - if constexpr (LockQueue) { - EventGraph->beginRecording(*QueueImpl); - } else { - EventGraph->beginRecordingUnlockedQueue(*QueueImpl); - } - } - } - - if (Graph) { - if (EventGraph == nullptr) { - throw sycl::exception( - make_error_code(errc::invalid), - "Graph nodes cannot depend on events from outside the graph."); - } - if (EventGraph != Graph) { - throw sycl::exception( - make_error_code(errc::invalid), - "Graph nodes cannot depend on events from another graph."); - } - } - EventsRegistered.push_back(EventImpl); -} - -template void registerEventDependency( - const detail::EventImplPtr &EventImpl, - std::vector &EventsRegistered, - detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, - const detail::device_impl &DeviceImpl, - const std::shared_ptr &Graph, - sycl::detail::CGType CommandGroupType); - -template void registerEventDependency( - const detail::EventImplPtr &EventImpl, - std::vector &EventsRegistered, - detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, - const detail::device_impl &DeviceImpl, - const std::shared_ptr &Graph, - sycl::detail::CGType CommandGroupType); - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 0e4a1e62d904a..5c2fee0f74d75 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -475,15 +475,6 @@ class events_range : public iterator_range { using Base::Base; }; -template -void registerEventDependency( - const detail::EventImplPtr &EventToRegister, - std::vector &EventsRegistered, - detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl, - const detail::device_impl &DeviceImpl, - const std::shared_ptr &Graph, - sycl::detail::CGType CommandGroupType); - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index ef1dd7fd9d3f9..9ba6262eaec05 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include #include @@ -639,7 +640,7 @@ queue_impl::submit_direct(bool CallerNeedsEvent, if (ExternalEvent) { registerEventDependency( getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(), - getDeviceImpl(), hasCommandGraph() ? getCommandGraph() : nullptr, + getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr, detail::CGType::Kernel); } @@ -650,14 +651,14 @@ queue_impl::submit_direct(bool CallerNeedsEvent, if (isInOrder() && LastEvent) { registerEventDependency( LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(), - hasCommandGraph() ? getCommandGraph() : nullptr, + hasCommandGraph() ? getCommandGraph().get() : nullptr, detail::CGType::Kernel); } for (event e : DepEvents) { registerEventDependency( getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(), - getDeviceImpl(), hasCommandGraph() ? getCommandGraph() : nullptr, + getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr, detail::CGType::Kernel); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 146695a1d2681..7a606ec79b095 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -1673,7 +1674,8 @@ void handler::depends_on(const std::vector &Events) { void handler::depends_on(const detail::EventImplPtr &EventImpl) { registerEventDependency(EventImpl, impl->CGData.MEvents, impl->get_queue_or_null(), impl->get_context(), - impl->get_device(), getCommandGraph(), getType()); + impl->get_device(), getCommandGraph().get(), + getType()); } void handler::depends_on(const std::vector &Events) { From d0882d46ba2e1ebfe4178bd6f3192a225be9cfce Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 10:25:03 +0000 Subject: [PATCH 11/24] Limit the sycl::span layout test to host --- sycl/test/abi/layout_span.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/test/abi/layout_span.cpp b/sycl/test/abi/layout_span.cpp index d274b5ee33247..49fdce6d79f58 100644 --- a/sycl/test/abi/layout_span.cpp +++ b/sycl/test/abi/layout_span.cpp @@ -1,5 +1,4 @@ // RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s -// RUN: %clangxx -fsycl -fsycl-device-only -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s // REQUIRES: linux // UNSUPPORTED: libcxx From e780335fbbd1d74d3f1d809ae4cb3a42737fbc69 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 11:03:32 +0000 Subject: [PATCH 12/24] Update Linux symbols --- sycl/test/abi/sycl_symbols_linux.dump | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 5e58ac99474d3..e42759431d374 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,12 +2985,12 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_22sub_group_progress_keyENSJ_22work_item_progress_keyENSJ_4cuda12cluster_sizeILi1EEENST_ILi2EEENST_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_22sub_group_progress_keyENSJ_22work_item_progress_keyENSJ_4cuda12cluster_sizeILi1EEENST_ILi2EEENST_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_22sub_group_progress_keyENSJ_22work_item_progress_keyENSJ_4cuda12cluster_sizeILi1EEENST_ILi2EEENST_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_22sub_group_progress_keyENSI_22work_item_progress_keyENSI_4cuda12cluster_sizeILi1EEENSS_ILi2EEENSS_ILi3EEEEEERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_22sub_group_progress_keyENSI_22work_item_progress_keyENSI_4cuda12cluster_sizeILi1EEENSS_ILi2EEENSS_ILi3EEEEEERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_22sub_group_progress_keyENSI_22work_item_progress_keyENSI_4cuda12cluster_sizeILi1EEENSS_ILi2EEENSS_ILi3EEEEEERKNS9_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE _ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv From bece167eaaf3a3e707c6c4988af8461b3adf3afe Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 11:19:12 +0000 Subject: [PATCH 13/24] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index f7e0f0f062223..1db98f1b5cf9d 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -286,12 +286,12 @@ ??$is_image_handle_supported@Usampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVdevice@34@AEBVcontext@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z -??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z From 12aa7af80f54871ca02aa2f96ef0a1173bdd5068 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 11:29:21 +0000 Subject: [PATCH 14/24] Enable properties parsing for kernel submission with event deps --- sycl/include/sycl/queue.hpp | 40 ++++++++++++------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 5bcfe2c97cf37..d170426e5cb63 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2860,15 +2860,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - // TODO The handler-less path does not support kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. - if constexpr ( - std::is_same_v && - !(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + // TODO The handler-less path does not support kernel functions + // with the kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { return detail::submit_kernel_direct_single_task( *this, KernelFunc, sycl::span(&DepEvent, 1), Properties, TlsCodeLocCapture.query()); @@ -2926,12 +2921,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - if constexpr ( - std::is_same_v && - !(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + // TODO The handler-less path does not support kernel functions + // with the kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { return detail::submit_kernel_direct_single_task( *this, KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query()); } else { @@ -3465,13 +3458,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( @@ -3536,13 +3526,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( From e35119cf59e725135c489841505c12aaf5b1bd1b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 11:45:27 +0000 Subject: [PATCH 15/24] Remove unused include and a new line --- sycl/source/detail/event_impl.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index ccd90fe5a7cd3..53727aa0505ba 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include @@ -474,7 +473,6 @@ class events_range : public iterator_range { public: using Base::Base; }; - } // namespace detail } // namespace _V1 } // namespace sycl From 07920cac4a82110136ed70c0283c90270a3d89ad Mon Sep 17 00:00:00 2001 From: Slawomir Ptak Date: Tue, 4 Nov 2025 13:55:54 +0100 Subject: [PATCH 16/24] Update sycl/source/detail/event_deps.hpp Co-authored-by: Sergei Vinogradov --- sycl/source/detail/event_deps.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/source/detail/event_deps.hpp b/sycl/source/detail/event_deps.hpp index cd62a86138c47..dd8c7bf30a9f6 100644 --- a/sycl/source/detail/event_deps.hpp +++ b/sycl/source/detail/event_deps.hpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +#pragma once + #include #include #include From 43471f1594b6d2e8c5e107a3bc4341540ddebdd4 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 14:10:24 +0000 Subject: [PATCH 17/24] Fix function args --- .../ext/oneapi/experimental/enqueue_functions.hpp | 2 +- sycl/include/sycl/queue.hpp | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 924bae1d9ae22..0f06f610600b6 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -312,7 +312,7 @@ void nd_launch(queue Q, launch_config, Properties> Config, LaunchConfigAccess(Config); detail::submit_kernel_direct_parallel_for( - std::move(Q), LaunchConfigAccess.getRange(), KernelObj, + std::move(Q), LaunchConfigAccess.getRange(), KernelObj, {}, LaunchConfigAccess.getProperties()); } else { submit(std::move(Q), [&](handler &CGH) { diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index d170426e5cb63..95c96004dc067 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3374,7 +3374,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( - *this, Range, Rest..., Properties, TlsCodeLocCapture.query()); + *this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query()); } else return submit( [&](handler &CGH) { @@ -3465,9 +3465,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( - *this, ext::oneapi::experimental::empty_properties_t{}, Range, - Rest..., sycl::span(&DepEvent, 1), - TlsCodeLocCapture.query()); + *this, Range, Rest..., sycl::span(&DepEvent, 1), + ext::oneapi::experimental::empty_properties_t{}, TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { @@ -3532,8 +3531,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( - *this, ext::oneapi::experimental::empty_properties_t{}, Range, - Rest..., DepEvents, TlsCodeLocCapture.query()); + *this, Range, Rest..., DepEvents, ext::oneapi::experimental::empty_properties_t{}, + TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { From 9e046b5d7ac107e0da138e1a1922eb17a24257f8 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 4 Nov 2025 14:14:57 +0000 Subject: [PATCH 18/24] Fix formatting --- sycl/include/sycl/queue.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 95c96004dc067..6025b8b30edce 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3466,7 +3466,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., sycl::span(&DepEvent, 1), - ext::oneapi::experimental::empty_properties_t{}, TlsCodeLocCapture.query()); + ext::oneapi::experimental::empty_properties_t{}, + TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { @@ -3531,7 +3532,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( - *this, Range, Rest..., DepEvents, ext::oneapi::experimental::empty_properties_t{}, + *this, Range, Rest..., DepEvents, + ext::oneapi::experimental::empty_properties_t{}, TlsCodeLocCapture.query()); } else { return submit( From 618183780cfad7dbb56750121efb5c4559866c04 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 5 Nov 2025 10:08:42 +0000 Subject: [PATCH 19/24] Address review comments --- .../sycl/ext/oneapi/experimental/enqueue_functions.hpp | 2 +- sycl/include/sycl/khr/free_function_commands.hpp | 6 +++--- sycl/include/sycl/queue.hpp | 4 ++-- sycl/source/detail/event_deps.hpp | 6 +++--- sycl/source/detail/queue_impl.cpp | 6 +++--- sycl/source/detail/queue_impl.hpp | 7 ++++--- 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 0f06f610600b6..ffa071f209580 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -274,7 +274,7 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { detail::submit_kernel_direct_parallel_for(std::move(Q), Range, - KernelObj, {}); + KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index b10e1dd9823a9..f155d4e6ad73c 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -162,7 +162,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k, if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<1>>::value)) { detail::submit_kernel_direct_parallel_for(q, nd_range<1>(r, size), - std::forward(k), {}); + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -179,7 +179,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k, if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<2>>::value)) { detail::submit_kernel_direct_parallel_for(q, nd_range<2>(r, size), - std::forward(k), {}); + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -196,7 +196,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k, if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<3>>::value)) { detail::submit_kernel_direct_parallel_for(q, nd_range<3>(r, size), - std::forward(k), {}); + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 6025b8b30edce..f7bff57c2df9a 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -249,7 +249,7 @@ template auto submit_kernel_direct_parallel_for( const queue &Queue, const nd_range &Range, - KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, + KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, const detail::code_location &CodeLoc = detail::code_location::current()) { @@ -280,7 +280,7 @@ template auto submit_kernel_direct_single_task( const queue &Queue, KernelTypeUniversalRef &&KernelFunc, - sycl::span DepEvents, + sycl::span DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, const detail::code_location &CodeLoc = detail::code_location::current()) { diff --git a/sycl/source/detail/event_deps.hpp b/sycl/source/detail/event_deps.hpp index dd8c7bf30a9f6..fb30099dc9821 100644 --- a/sycl/source/detail/event_deps.hpp +++ b/sycl/source/detail/event_deps.hpp @@ -29,10 +29,10 @@ namespace detail { /// the queue will be switched to a recording mode (transitive queue /// recording feature). /// -/// The LockQueue template argument defines, whether the queue lock +/// The LockQueue template argument defines whether the queue lock /// should be acquired for the transition to a recording mode. It is -/// set to false in cases, where the event dependencies are set directly -/// in the command submission flow, where the lock is already aquired. +/// set to false in cases where the event dependencies are set directly +/// in the command submission flow and the lock is already acquired. /// /// \param EventImpl Event to register as a dependency /// \param EventsRegistered A list of already registered events, where diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 45a70d0ffe5f2..5b7bfb5e90fae 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -643,7 +643,7 @@ queue_impl::submit_direct(bool CallerNeedsEvent, // Sync with an external event std::optional ExternalEvent = popExternalEvent(); if (ExternalEvent) { - registerEventDependency( + registerEventDependency( getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(), getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr, detail::CGType::Kernel); @@ -654,14 +654,14 @@ queue_impl::submit_direct(bool CallerNeedsEvent, // Sync with the last event for in order queue EventImplPtr &LastEvent = Deps.LastEventPtr; if (isInOrder() && LastEvent) { - registerEventDependency( + registerEventDependency( LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr, detail::CGType::Kernel); } for (event e : DepEvents) { - registerEventDependency( + registerEventDependency( getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(), getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr, detail::CGType::Kernel); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e505cf717b0d2..7c793b619ecab 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -368,8 +368,8 @@ class queue_impl : public std::enable_shared_from_this { const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, HostKernel, DeviceKernelInfo, true, DepEvents, Props, - CodeLoc, IsTopCodeLoc); + NDRDescT{Range}, HostKernel, DeviceKernelInfo, + /*CallerNeedsEvent*/ true, DepEvents, Props, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } @@ -381,7 +381,8 @@ class queue_impl : public std::enable_shared_from_this { const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, DepEvents, Props, CodeLoc, IsTopCodeLoc); + /*CallerNeedsEvent*/ false, DepEvents, Props, + CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, From 73cdaeac705c16252c6a29ae05274566cb259b45 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 5 Nov 2025 10:26:00 +0000 Subject: [PATCH 20/24] Temp - add asserts --- sycl/include/sycl/queue.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index f7bff57c2df9a..9f9f4ba815238 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2859,6 +2859,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + assert(false); // TODO The handler-less path does not support kernel functions // with the kernel_handler type argument yet. @@ -2921,6 +2922,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + assert(false); + // TODO The handler-less path does not support kernel functions // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; + assert(false); // TODO The handler-less path does not support reductions, and // kernel functions with the kernel_handler type argument yet. @@ -3525,6 +3529,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; + assert(false); // TODO The handler-less path does not support reductions, and // kernel functions with the kernel_handler type argument yet. From 5c8df60ef1ebf6862124ae7eb219c0ad4a7392bd Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 5 Nov 2025 10:58:20 +0000 Subject: [PATCH 21/24] Move assertions --- sycl/include/sycl/queue.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 9f9f4ba815238..5646616523869 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2859,12 +2859,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - assert(false); // TODO The handler-less path does not support kernel functions // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { + assert(false); return detail::submit_kernel_direct_single_task( *this, KernelFunc, sycl::span(&DepEvent, 1), Properties, TlsCodeLocCapture.query()); @@ -2922,12 +2922,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - assert(false); - // TODO The handler-less path does not support kernel functions // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { + assert(false); return detail::submit_kernel_direct_single_task( *this, KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query()); } else { @@ -3461,13 +3460,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - assert(false); // TODO The handler-less path does not support reductions, and // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { + assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., sycl::span(&DepEvent, 1), ext::oneapi::experimental::empty_properties_t{}, @@ -3529,13 +3528,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - assert(false); // TODO The handler-less path does not support reductions, and // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { + assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., DepEvents, ext::oneapi::experimental::empty_properties_t{}, From 378abb077646b8f743f955b65d82f7198ce051d9 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 5 Nov 2025 12:52:03 +0000 Subject: [PATCH 22/24] Switch asserts --- sycl/include/sycl/queue.hpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 5646616523869..d42913a7b97ec 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2864,7 +2864,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { - assert(false); return detail::submit_kernel_direct_single_task( *this, KernelFunc, sycl::span(&DepEvent, 1), Properties, TlsCodeLocCapture.query()); @@ -2926,7 +2925,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { - assert(false); return detail::submit_kernel_direct_single_task( *this, KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query()); } else { @@ -3438,6 +3436,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Properties, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + assert(false); + } return submit( [&](handler &CGH) { CGH.depends_on(DepEvent); @@ -3466,7 +3473,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., sycl::span(&DepEvent, 1), ext::oneapi::experimental::empty_properties_t{}, @@ -3504,6 +3510,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Properties, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + assert(false); + } + return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); @@ -3534,7 +3550,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., DepEvents, ext::oneapi::experimental::empty_properties_t{}, From 0b996a377dc6618440ba72b54d4ddd76eada9b8e Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 5 Nov 2025 14:21:16 +0000 Subject: [PATCH 23/24] All functions asserts --- sycl/include/sycl/queue.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index d42913a7b97ec..7179e8e6e68ac 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2808,6 +2808,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { + assert(false); return detail::submit_kernel_direct_single_task( *this, KernelFunc, {}, Properties, TlsCodeLocCapture.query()); } else { @@ -2864,6 +2865,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { + assert(false); return detail::submit_kernel_direct_single_task( *this, KernelFunc, sycl::span(&DepEvent, 1), Properties, TlsCodeLocCapture.query()); @@ -2925,6 +2927,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { + assert(false); return detail::submit_kernel_direct_single_task( *this, KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query()); } else { @@ -3372,7 +3375,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - + assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query()); } else @@ -3402,6 +3405,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { + assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., {}, ext::oneapi::experimental::empty_properties_t{}, @@ -3473,6 +3477,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { + assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., sycl::span(&DepEvent, 1), ext::oneapi::experimental::empty_properties_t{}, @@ -3550,6 +3555,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { + assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., DepEvents, ext::oneapi::experimental::empty_properties_t{}, From 74b6fabd7b1a3d901db177c6ce3bd5cd7b684af9 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 5 Nov 2025 17:19:03 +0000 Subject: [PATCH 24/24] asserts on properties only --- sycl/include/sycl/queue.hpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 7179e8e6e68ac..71e2479fb8346 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2804,11 +2804,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + assert(false); + // TODO The handler-less path does not support kernel functions // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { - assert(false); + return detail::submit_kernel_direct_single_task( *this, KernelFunc, {}, Properties, TlsCodeLocCapture.query()); } else { @@ -2861,11 +2863,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + assert(false); + // TODO The handler-less path does not support kernel functions // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { - assert(false); + return detail::submit_kernel_direct_single_task( *this, KernelFunc, sycl::span(&DepEvent, 1), Properties, TlsCodeLocCapture.query()); @@ -2923,11 +2927,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + assert(false); + // TODO The handler-less path does not support kernel functions // with the kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT::value)) { - assert(false); + return detail::submit_kernel_direct_single_task( *this, KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query()); } else { @@ -3370,12 +3376,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; + assert(false); + // TODO The handler-less path does not support reductions, and // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); + return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query()); } else @@ -3405,7 +3413,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., {}, ext::oneapi::experimental::empty_properties_t{}, @@ -3442,12 +3449,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; + assert(false); + // TODO The handler-less path does not support reductions, and // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); } return submit( [&](handler &CGH) { @@ -3477,7 +3485,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., sycl::span(&DepEvent, 1), ext::oneapi::experimental::empty_properties_t{}, @@ -3517,12 +3524,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; + assert(false); + // TODO The handler-less path does not support reductions, and // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); } return submit( @@ -3555,7 +3563,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - assert(false); return detail::submit_kernel_direct_parallel_for( *this, Range, Rest..., DepEvents, ext::oneapi::experimental::empty_properties_t{},