Skip to content

Commit 6d614d6

Browse files
committed
[Executorch] Introduce caching cpu memory allocator
This is meant to be used as the temp allocator for kernels. Specifically for SDPA, it seems that on iOS there is significant overhead coming from allocations. Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/) ghstack-source-id: 321123656 Pull Request resolved: #15611
1 parent 6763d60 commit 6d614d6

File tree

8 files changed

+553
-0
lines changed

8 files changed

+553
-0
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# Please keep this file formatted by running:
8+
# ~~~
9+
# cmake-format -i CMakeLists.txt
10+
# ~~~
11+
12+
cmake_minimum_required(VERSION 3.19)

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

# Prefix this library's own source list with the source root. The list that
# is transformed must be the same one handed to add_library() below.
# NOTE(review): assumes _extension_memory_allocator__srcs holds paths relative
# to EXECUTORCH_ROOT -- confirm against where the variable is populated.
list(TRANSFORM _extension_memory_allocator__srcs PREPEND "${EXECUTORCH_ROOT}/")
if(CMAKE_TOOLCHAIN_IOS
   OR CMAKE_TOOLCHAIN_ANDROID
   OR APPLE
)
  # Building a shared library on iOS requires code signing. On Android we see
  # duplicated registration when using a shared lib.
  add_library(
    extension_memory_allocator STATIC ${_extension_memory_allocator__srcs}
  )
else()
  add_library(extension_memory_allocator ${_extension_memory_allocator__srcs})
endif()
target_link_libraries(extension_memory_allocator PRIVATE executorch_core)
target_include_directories(
  extension_memory_allocator PUBLIC ${_common_include_directories}
)
target_compile_options(
  extension_memory_allocator
  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
)

# Install libraries
install(
  TARGETS extension_memory_allocator
  EXPORT ExecuTorchTargets
  DESTINATION ${CMAKE_INSTALL_LIBDIR}
  INCLUDES
  DESTINATION ${_common_include_directories}
)
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#include <cstdlib>
2+
3+
#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
4+
5+
namespace executorch::extension {
6+
7+
namespace {
8+
// Rounds `size` up to a multiple of the effective alignment, which is the
// larger of `alignment` and kDefaultAlignment. A size that is already a
// multiple is returned unchanged. The mask arithmetic relies on the effective
// alignment being a power of two, and the addition wraps around for sizes
// close to the maximum value of size_t.
size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
  const size_t effective = std::max(alignment, kDefaultAlignment);
  const bool already_multiple = (size % effective) == 0;
  // aligned_alloc wants the size to be a multiple of the alignment, so bump
  // to the next multiple when it is not one already.
  return already_multiple ? size : ((size + effective) & ~(effective - 1));
}
18+
} // namespace
19+
20+
// Builds an allocator with no backing buffer (the base class is given a
// zero-sized, null buffer), remembers the size limit, and starts with a zero
// byte count.
CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
    : MemoryAllocator(0, nullptr), max_size_(max_size), current_size_(0) {}
24+
25+
void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
26+
EXECUTORCH_TRACK_ALLOCATION(prof_id(), size);
27+
28+
if (!isPowerOf2(alignment)) {
29+
ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
30+
return nullptr;
31+
}
32+
size = get_alignment_adjusted_size(size, alignment);
33+
34+
std::lock_guard<std::mutex> guard(mutex_);
35+
const auto& it = available_map_.find(size);
36+
if (it == available_map_.end() || it->second.empty()) {
37+
if (current_size_ + size > max_size_) {
38+
// Freeing while holding the lock will cause performance issues
39+
// we probably should log how often this happens so as to allow
40+
// for calling site to adjust the max_size_ parameter
41+
free_cached();
42+
}
43+
void* ptr = std::aligned_alloc(alignment, size);
44+
current_size_ += size;
45+
if (ptr == nullptr) {
46+
ET_LOG(Error, "Failed to allocate memory");
47+
return nullptr;
48+
}
49+
allocation_map_[ptr] = size;
50+
return ptr;
51+
}
52+
void* ptr = it->second.back();
53+
it->second.pop_back();
54+
allocation_map_[ptr] = size;
55+
return ptr;
56+
}
57+
58+
// Releases every cached (currently unused) block back to the system and
// empties the cache. This function does not take mutex_ itself: allocate()
// calls it while already holding the lock, so locking here would deadlock.
// A recursive_mutex would also work, but since this is not a public API the
// locking is simply left to the caller.
void CPUCachingAllocator::free_cached() {
  for (auto bucket = available_map_.begin(); bucket != available_map_.end();
       ++bucket) {
    for (void* cached_block : bucket->second) {
      std::free(cached_block);
    }
  }
  available_map_.clear();
}
69+
70+
void CPUCachingAllocator::reset() {
71+
std::lock_guard<std::mutex> guard(mutex_);
72+
for (auto& it : allocation_map_) {
73+
void* ptr = it.first;
74+
size_t alloc_size = it.second;
75+
// Cache the memory
76+
available_map_[alloc_size].push_back(ptr);
77+
current_size_ -= alloc_size;
78+
}
79+
allocation_map_.clear();
80+
}
81+
82+
// Tears the allocator down: first moves all live blocks into the cache, then
// frees everything in the cache. free_cached() runs without the lock here,
// so the caller must ensure the destructor is invoked in a thread-safe
// manner.
CPUCachingAllocator::~CPUCachingAllocator() {
  // Qualified call: spells out that this class's own reset() is the one run
  // during destruction.
  CPUCachingAllocator::reset();
  free_cached();
}
87+
88+
} // namespace executorch::extension
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#pragma once
2+
3+
#include <cstddef>
4+
#include <mutex>
5+
6+
#include <executorch/runtime/core/memory_allocator.h>
7+
8+
#ifdef USE_C10_SMALL_VECTOR
9+
#include <c10/util/SmallVector.h>
10+
#else
11+
#include <vector>
12+
#endif
13+
14+
#ifdef USE_C10_FLAT_HASH_MAP
15+
#include <c10/util/flat_hash_map.h>
16+
#else
17+
#include <unordered_map>
18+
#endif
19+
20+
/*
21+
* CPUCachingAllocator:
22+
* This file is copied over from c10/mobile/CPUCachingAllocator.h
23+
* It is a thread safe caching allocator.
24+
*/
25+
26+
namespace executorch::extension {
27+
28+
// Container used for the per-size lists of cached pointers. With
// USE_C10_SMALL_VECTOR defined it is c10::SmallVector; otherwise it falls
// back to std::vector and the inline-capacity parameter is ignored.
#ifdef USE_C10_SMALL_VECTOR
template <typename ElemT, unsigned InlineCount>
using SmallVector = c10::SmallVector<ElemT, InlineCount>;
#else
template <typename ElemT, unsigned InlineCount>
using SmallVector = std::vector<ElemT>;
#endif

// Hash map used for the allocator's bookkeeping. With USE_C10_FLAT_HASH_MAP
// defined it is ska::flat_hash_map; otherwise std::unordered_map.
#ifdef USE_C10_FLAT_HASH_MAP
template <typename K, typename V>
using FlatHashMap = ska::flat_hash_map<K, V>;
#else
template <typename K, typename V>
using FlatHashMap = std::unordered_map<K, V>;
#endif

// Default alignment, in bytes, for allocate(); also the minimum alignment
// that requested sizes are rounded against.
constexpr size_t kDefaultAlignment = 64;
45+
class CPUCachingAllocator : public executorch::runtime::MemoryAllocator {
46+
/*
47+
* What it does:
48+
* Caches all the allocations carried out by this allocator.
49+
* Cache key is the size of the allocation.
50+
* If requested size is found in the cache returns the cached pointer.
51+
* What it does not do:
52+
* No speculative allocation for any future allocations.
53+
*/
54+
private:
55+
void free_cached();
56+
57+
protected:
58+
// Invariants.
59+
// New invariants must be written.
60+
FlatHashMap<size_t, SmallVector<void*, 16>> available_map_;
61+
FlatHashMap<void*, size_t> allocation_map_;
62+
// Since allocation_map, which is a global instance, is mutated/read via
63+
// all public APIs we need a global mutex.
64+
std::mutex mutex_;
65+
size_t max_size_;
66+
size_t current_size_;
67+
68+
public:
69+
/*
70+
max_size: Maximum size of memory to cache. Never cache more than that.
71+
*/
72+
CPUCachingAllocator(uint32_t max_size);
73+
// Checks the cache to see if allocation of size bytes can be found.
74+
// If so return cached memory, else
75+
// allocates memory, records it for caching and returns.
76+
void* allocate(size_t size, size_t alignment = kDefaultAlignment) override;
77+
void reset() override;
78+
~CPUCachingAllocator();
79+
};
80+
81+
} // namespace executorch::extension

extension/memory_allocator/targets.bzl

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,20 @@ def define_common_targets():
2020
"@EXECUTORCH_CLIENTS",
2121
],
2222
)
23+
24+
runtime.cxx_library(
25+
name = "cpu_caching_allocator",
26+
srcs = [
27+
"cpu_caching_malloc_allocator.cpp",
28+
],
29+
exported_headers = [
30+
"cpu_caching_malloc_allocator.h",
31+
],
32+
exported_deps = [
33+
"//executorch/runtime/core:memory_allocator",
34+
],
35+
visibility = [
36+
"//executorch/extension/memory_allocator/test/...",
37+
"@EXECUTORCH_CLIENTS",
38+
],
39+
)

0 commit comments

Comments
 (0)