@@ -326,7 +326,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       ET_LOG(Error, "Ethos-U invocation failed error (%d)", result);
       return Error::InvalidProgram;
     }
-    int tensor_dim = 0, io_dim = 0;
+    size_t tensor_bytes_total = 0;
+    size_t io_bytes_total = 0;
     // Write outputs from scratch into EValue pointers
     for (int i = 0; i < handles.outputs->count; i++) {
       int tensor_count = 1, io_count = 1;
@@ -338,23 +339,39 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       calculate_dimensions(
           tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);

-      // At times the topological order of the outputs may change.
-      // Lets instead ensure that the sum of dimensions match.
-      tensor_dim = tensor_dim + tensor_count;
-      io_dim = io_dim + io_count;
+      size_t tensor_bytes = tensor_out.nbytes();
+      size_t io_bytes = static_cast<size_t>(io_count) *
+          static_cast<size_t>(handles.outputs->io[i].elem_size);
+
+      if (tensor_bytes != io_bytes) {
+        Error status = copy_with_layout_adjustment(
+            handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes);
+        if (status != Error::Ok) {
+          return status;
+        }
+        io_bytes_total += tensor_bytes;
+      } else {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");

-      EXECUTORCH_PROF_SCOPE(
-          event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
+        memcpy(
+            tensor_out.mutable_data_ptr<char>(),
+            static_cast<const char*>(output_addr),
+            tensor_bytes);
+        io_bytes_total += io_bytes;
+      }

-      memcpy(
-          tensor_out.mutable_data_ptr<char>(),
-          static_cast<const char*>(output_addr),
-          tensor_out.nbytes());
+      // At times the topological order of the outputs may change.
+      // Let's instead ensure that the sum of output bytes matches.
+      tensor_bytes_total += tensor_bytes;
     }
-    if (tensor_dim != io_dim) {
+    if (tensor_bytes_total != io_bytes_total) {
       ET_LOG(Error, "Total output tensor sizes do not match");
       ET_LOG(
-          Error, "Program expects size of %d but got %d", tensor_dim, io_dim);
+          Error,
+          "Program expects %zu bytes but got %zu",
+          io_bytes_total,
+          tensor_bytes_total);
       return Error::InvalidProgram;
     }
     return Error::Ok;
@@ -365,6 +382,147 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
   }

  private:
+  // Copies Vela output into the ExecuTorch tensor, adjusting for padding or
+  // packed layouts produced by the delegate.
+  Error copy_with_layout_adjustment(
+      const VelaIO& output_io,
+      int output_index,
+      const char* src,
+      executorch::aten::Tensor& tensor_out,
+      size_t tensor_bytes) const {
+    const int elem_size = output_io.elem_size;
+    if (elem_size == 0) {
+      ET_LOG(
+          Error, "Ethos-U output %d reports zero element size", output_index);
+      return Error::InvalidProgram;
+    }
+
+    size_t chunk_count = 1;
+    for (int dim = 0; dim < shapeDim - 1; ++dim) {
+      const int vela_dim = output_io.shape[dim];
+      chunk_count *= static_cast<size_t>(vela_dim == 0 ? 1 : vela_dim);
+    }
+    const int last_dim = output_io.shape[shapeDim - 1];
+    const size_t vela_chunk_elems =
+        static_cast<size_t>(last_dim == 0 ? 1 : last_dim);
+    const size_t vela_chunk_size =
+        vela_chunk_elems * static_cast<size_t>(elem_size);
+
+    if (tensor_bytes % chunk_count != 0) {
+      ET_LOG(
+          Error,
+          "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu",
+          output_index,
+          tensor_bytes,
+          chunk_count);
+      return Error::InvalidProgram;
+    }
+
+    const size_t chunk_size = tensor_bytes / chunk_count;
+
+    // If Vela writes fewer bytes than the tensor expects, we may need to
+    // expand 4-bit data to 8-bit: Ethos-U outputs may be packed 4-bit
+    // values, but ExecuTorch tensors are at least 8-bit.
+    if (vela_chunk_size < chunk_size) {
+      if (chunk_size % vela_chunk_size != 0) {
+        ET_LOG(
+            Error,
+            "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu",
+            output_index,
+            chunk_size,
+            vela_chunk_size);
+        return Error::InvalidProgram;
+      }
+
+      const size_t expand_factor = chunk_size / vela_chunk_size;
+      if (expand_factor == 2 && elem_size == 1 &&
+          tensor_out.scalar_type() == ScalarType::Char) {
+        return unpack_chunks_4bit_to_int8(
+            reinterpret_cast<const uint8_t*>(src),
+            tensor_out.mutable_data_ptr<int8_t>(),
+            chunk_count,
+            chunk_size,
+            vela_chunk_size);
+      }
+
+      ET_LOG(
+          Error,
+          "Ethos-U output %d expansion factor %zu with element size %d not supported",
+          output_index,
+          expand_factor,
+          elem_size);
+      return Error::InvalidProgram;
+    }
+
+    return strip_delegate_padding(
+        src,
+        tensor_out.mutable_data_ptr<char>(),
+        chunk_count,
+        chunk_size,
+        vela_chunk_size);
+  }
+
+  Error unpack_chunks_4bit_to_int8(
+      const uint8_t* src,
+      int8_t* dest,
+      size_t chunk_count,
+      size_t dest_chunk_size,
+      size_t src_chunk_size) const {
+    const uint8_t* chunk_src = src;
+    int8_t* chunk_dest = dest;
+    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+      unpack_single_chunk_4bit_to_int8(chunk_src, chunk_dest, src_chunk_size);
+      chunk_src += src_chunk_size;
+      chunk_dest += dest_chunk_size;
+    }
+    return Error::Ok;
+  }
+
+  void unpack_single_chunk_4bit_to_int8(
+      const uint8_t* src,
+      int8_t* dest,
+      size_t chunk_size) const {
+    for (size_t byte_idx = 0; byte_idx < chunk_size; ++byte_idx) {
+      const uint8_t packed = src[byte_idx];
+      int8_t low = static_cast<int8_t>(packed & 0x0F);
+      int8_t high = static_cast<int8_t>((packed >> 4) & 0x0F);
+      if (low >= 8) {
+        low -= 16;
+      }
+      if (high >= 8) {
+        high -= 16;
+      }
+      dest[2 * byte_idx] = low;
+      dest[2 * byte_idx + 1] = high;
+    }
+  }
+
+  Error strip_delegate_padding(
+      const char* src,
+      char* dest,
+      size_t chunk_count,
+      size_t dest_chunk_size,
+      size_t src_chunk_size) const {
+    if (dest_chunk_size > src_chunk_size) {
+      ET_LOG(
+          Error,
+          "dest chunk size %zu must not exceed src chunk size %zu",
+          dest_chunk_size,
+          src_chunk_size);
+      return Error::InvalidProgram;
+    }
+    if (src == nullptr || dest == nullptr) {
+      ET_LOG(Error, "Ethos-U padded copy received null buffer");
+      return Error::InvalidState;
+    }
+    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+      memcpy(dest, src, dest_chunk_size);
+      src += src_chunk_size;
+      dest += dest_chunk_size;
+    }
+    return Error::Ok;
+  }
+
   void calculate_dimensions(
       const executorch::aten::Tensor tensor,
       VelaIO* io,
@@ -389,4 +547,4 @@ static auto registered = register_backend(backend_id);

 } // namespace arm
 } // namespace backends
-} // namespace executorch
+} // namespace executorch
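
For reference, a minimal standalone sketch (not part of the patch above) of the nibble order and sign extension that unpack_single_chunk_4bit_to_int8 applies: each source byte carries two signed 4-bit values, low nibble first, and nibbles >= 8 are sign-extended by subtracting 16. The helper name unpack_4bit_to_int8 below is illustrative only.

// Illustrative sketch, assuming the packed-int4 layout described above.
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical helper: expands n_bytes packed int4 values into 2 * n_bytes int8 values.
static void unpack_4bit_to_int8(const uint8_t* src, int8_t* dest, size_t n_bytes) {
  for (size_t i = 0; i < n_bytes; ++i) {
    int8_t low = static_cast<int8_t>(src[i] & 0x0F);          // low nibble first
    int8_t high = static_cast<int8_t>((src[i] >> 4) & 0x0F);  // then high nibble
    if (low >= 8) {
      low -= 16;  // sign-extend: 0x8..0xF map to -8..-1
    }
    if (high >= 8) {
      high -= 16;
    }
    dest[2 * i] = low;
    dest[2 * i + 1] = high;
  }
}

int main() {
  // 0xF1 packs 1 (low) and -1 (high); 0x8A packs -6 (low) and -8 (high).
  const uint8_t packed[] = {0xF1, 0x8A};
  int8_t unpacked[4] = {};
  unpack_4bit_to_int8(packed, unpacked, sizeof(packed));
  for (int8_t v : unpacked) {
    printf("%d ", v);  // prints: 1 -1 -6 -8
  }
  printf("\n");
  return 0;
}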