Commit 40e6dad

Implement Numba VM with caching

And make that the default backend

1 parent fb60edb · commit 40e6dad

File tree: 9 files changed, +240 −51 lines changed
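
Taken together, the changes below make the VM-based Numba backend the default and let Numba's on-disk cache persist compiled kernels across processes. A minimal sketch of the intended effect (the graph here is illustrative, not part of the commit):

    import pytensor
    import pytensor.tensor as pt

    x = pt.vector("x")
    fn = pytensor.function([x], pt.exp(x) + 1)  # numba_vm is now the default linker

    # Re-running the same script should hit Numba's on-disk cache: the
    # generated Elemwise wrappers are written under <base_compiledir>/numba
    # with deterministic keys instead of throwaway temp files.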

pytensor/compile/mode.py

Lines changed: 9 additions & 3 deletions

@@ -50,7 +50,7 @@
     "jax": JAXLinker(),
     "pytorch": PytorchLinker(),
     "numba": NumbaLinker(),
-    "numba_vm": NumbaLinker(),
+    "numba_vm": NumbaLinker(vm=True),
 }
 
 
@@ -453,15 +453,15 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
 # string as the key
 # Use VM_linker to allow lazy evaluation by default.
 FAST_COMPILE = Mode(
-    NumbaLinker(),
+    NumbaLinker(vm=True),
     # TODO: Fast_compile should just use python code, CHANGE ME!
     RewriteDatabaseQuery(
         include=["fast_compile", "numba"],
         exclude=["cxx_only", "BlasOpt", "local_careduce_fusion"],
     ),
 )
 FAST_RUN = Mode(
-    NumbaLinker(),
+    NumbaLinker(vm=True),
     RewriteDatabaseQuery(
         include=["fast_run", "numba"],
         exclude=["cxx_only", "BlasOpt", "local_careduce_fusion"],
@@ -481,6 +481,11 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
     ),
 )
 
+NUMBA_VM = Mode(
+    NumbaLinker(vm=True),
+    NUMBA._optimizer,
+)
+
 JAX = Mode(
     JAXLinker(),
     RewriteDatabaseQuery(
@@ -519,6 +524,7 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
     "FAST_RUN": FAST_RUN,
     "JAX": JAX,
     "NUMBA": NUMBA,
+    "NUMBA_VM": NUMBA_VM,
     "PYTORCH": PYTORCH,
 }
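
Since "NUMBA_VM" is now registered in the predefined modes mapping above, it can be requested by name when compiling; a small sketch (the graph is illustrative):

    import pytensor
    import pytensor.tensor as pt

    x = pt.vector("x")
    y = (x**2).sum()

    # NUMBA_VM pairs NumbaLinker(vm=True) with the NUMBA mode's optimizer.
    fn = pytensor.function([x], y, mode="NUMBA_VM")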

pytensor/configdefaults.py

Lines changed: 3 additions & 2 deletions

@@ -380,11 +380,12 @@ def add_compile_configvars():
             "vm_nogc",
             "cvm_nogc",
             "jax",
+            "numba",
         ]
     else:
         # g++ is not present or the user disabled it,
         # linker should default to python only.
-        linker_options = ["py", "vm", "vm_nogc", "jax"]
+        linker_options = ["py", "vm", "vm_nogc", "jax", "numba"]
     if type(config).cxx.is_default:
         # If the user provided an empty value for cxx, do not warn.
         _logger.warning(
@@ -398,7 +399,7 @@ def add_compile_configvars():
         "linker",
         "Default linker used if the pytensor flags mode is Mode",
         # Not mutable because the default mode is cached after the first use.
-        EnumStr("numba", linker_options, mutable=False),
+        EnumStr("numba_vm", linker_options, mutable=False),
         in_c_key=False,
     )
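
The new default only applies when no explicit choice is made; because the config entry is declared with mutable=False, an override has to be in place before the first compilation, e.g. via PyTensor flags (a sketch):

    # Select the non-VM Numba linker for a single run (shell):
    #   PYTENSOR_FLAGS="linker=numba" python my_script.py
    # The same flag accepts the other entries of linker_options,
    # e.g. linker=vm or linker=jax.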

pytensor/link/numba/cache.py (new file)

Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+from collections.abc import Callable
+from pathlib import Path
+from tempfile import NamedTemporaryFile, TemporaryFile
+from typing import Any
+
+from numba.core.caching import CacheImpl, _CacheLocator
+
+from pytensor import config
+
+
+NUMBA_PYTENSOR_CACHE_ENABLED = True
+COMPILED_SRC_FUNCTIONS = {}
+
+
+def compile_and_cache_numba_function_src(
+    src: str,
+    function_name: str,
+    global_env: dict[Any, Any] | None = None,
+    local_env: dict[Any, Any] | None = None,
+    key: str | None = None,
+) -> Callable:
+    if key is not None:
+        numba_path = config.base_compiledir / "numba"
+        numba_path.mkdir(exist_ok=True)
+        filename = numba_path / key
+        with filename.open("wb") as f:
+            f.write(src.encode())
+    else:
+        with NamedTemporaryFile(delete=False) as f:
+            filename = f.name
+            f.write(src.encode())
+
+    if global_env is None:
+        global_env = {}
+
+    if local_env is None:
+        local_env = {}
+
+    mod_code = compile(src, filename, mode="exec")
+    exec(mod_code, global_env, local_env)
+
+    res = local_env[function_name]
+    res.__source__ = src  # type: ignore
+
+    if key is not None:
+        COMPILED_SRC_FUNCTIONS[res] = key
+    return res
+
+
+class NumbaPyTensorCacheLocator(_CacheLocator):
+    def __init__(self, py_func, py_file, hash):
+        self._py_func = py_func
+        self._py_file = py_file
+        self._hash = hash
+
+    def ensure_cache_path(self):
+        path = self.get_cache_path()
+        path.mkdir(exist_ok=True)
+        # Ensure the directory is writable by trying to write a temporary file
+        TemporaryFile(dir=path).close()
+
+    def get_cache_path(self):
+        """
+        Return the directory the function is cached in.
+        """
+        return self._py_file
+
+    def get_source_stamp(self):
+        """
+        Get a timestamp representing the source code's freshness.
+        Can return any picklable Python object.
+        """
+        return 0
+
+    def get_disambiguator(self):
+        """
+        Get a string disambiguator for this locator's function.
+        It should allow disambiguating different but similarly-named functions.
+        """
+        return self._hash
+
+    @classmethod
+    def from_function(cls, py_func, py_file):
+        """
+        Create a locator instance for the given function located in the given file.
+        """
+        if NUMBA_PYTENSOR_CACHE_ENABLED and py_func in COMPILED_SRC_FUNCTIONS:
+            return cls(py_func, Path(py_file).parent, COMPILED_SRC_FUNCTIONS[py_func])
+
+
+CacheImpl._locator_classes.insert(0, NumbaPyTensorCacheLocator)
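
A toy illustration of what the new helper gives callers; the source string and key here are made up:

    from pytensor.link.numba.cache import compile_and_cache_numba_function_src

    src = "def add_one(x):\n    return x + 1\n"

    # With a key, the source is written to <base_compiledir>/numba/<key> and
    # the function is recorded in COMPILED_SRC_FUNCTIONS, so the locator
    # above claims it when Numba decides where to cache the compiled result.
    add_one = compile_and_cache_numba_function_src(src, "add_one", key="add_one_v1")
    assert add_one(41) == 42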

pytensor/link/numba/dispatch/basic.py

Lines changed: 10 additions & 8 deletions

@@ -14,7 +14,7 @@
 from numba import types
 from numba.core.errors import NumbaWarning, TypingError
 from numba.cpython.unsafe.tuple import tuple_setitem  # noqa: F401
-from numba.extending import box, overload
+from numba.extending import box, overload, register_jitable as _register_jitable
 
 from pytensor import In, config
 from pytensor.compile import NUMBA
@@ -50,10 +50,11 @@ def global_numba_func(func):
     return func
 
 
-def numba_njit(*args, fastmath=None, **kwargs):
-    kwargs.setdefault("cache", config.numba__cache)
-    kwargs.setdefault("no_cpython_wrapper", True)
-    kwargs.setdefault("no_cfunc_wrapper", True)
+def numba_njit(*args, fastmath=None, register_jitable: bool = False, **kwargs):
+    kwargs.setdefault("cache", True)
+    kwargs.setdefault("no_cpython_wrapper", False)
+    kwargs.setdefault("no_cfunc_wrapper", False)
     if fastmath is None:
         if config.numba__fastmath:
             # Opinionated default on fastmath flags
@@ -81,10 +82,11 @@ def numba_njit(*args, fastmath=None, **kwargs):
             category=NumbaWarning,
         )
 
+    func = _register_jitable if register_jitable else numba.njit
     if len(args) > 0 and callable(args[0]):
-        return numba.njit(*args[1:], fastmath=fastmath, **kwargs)(args[0])
-
-    return numba.njit(*args, fastmath=fastmath, **kwargs)
+        return func(*args[1:], fastmath=fastmath, **kwargs)(args[0])
+    else:
+        return func(*args, fastmath=fastmath, **kwargs)
 
 
 def numba_vectorize(*args, **kwargs):
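
A sketch of the new register_jitable switch; the functions are illustrative. With register_jitable=True the definition goes through numba.extending.register_jitable (compiled lazily when called from jitted code) instead of being compiled eagerly with numba.njit:

    from pytensor.link.numba.dispatch.basic import numba_njit

    @numba_njit(register_jitable=True)
    def half(x):
        return x / 2

    @numba_njit
    def quarter(x):
        # `half` is inlined into this jitted function when it compiles
        return half(half(x))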

pytensor/link/numba/dispatch/elemwise.py

Lines changed: 44 additions & 9 deletions

@@ -1,4 +1,5 @@
 from functools import singledispatch
+from hashlib import sha256
 from textwrap import dedent, indent
 
 import numba
@@ -7,18 +8,17 @@
 from numpy.lib.stride_tricks import as_strided
 
 from pytensor.graph.op import Op
+from pytensor.link.numba.cache import compile_and_cache_numba_function_src
 from pytensor.link.numba.dispatch import basic as numba_basic
 from pytensor.link.numba.dispatch.basic import (
     numba_funcify,
     numba_njit,
 )
 from pytensor.link.numba.dispatch.vectorize_codegen import (
-    _jit_options,
     _vectorized,
     encode_literals,
     store_core_outputs,
 )
-from pytensor.link.utils import compile_function_src
 from pytensor.npy_2_compat import normalize_axis_index, normalize_axis_tuple
 from pytensor.scalar.basic import (
     AND,
@@ -237,7 +237,7 @@ def {careduce_fn_name}(x):
     careduce_def_src += "\n\n"
     careduce_def_src += indent(f"return {return_obj}", " " * 4)
 
-    careduce_fn = compile_function_src(
+    careduce_fn = compile_and_cache_numba_function_src(
         careduce_def_src, careduce_fn_name, {**globals(), **global_env}
     )
 
@@ -264,19 +264,34 @@ def axis_apply_fn(x):
 
 @numba_funcify.register(Elemwise)
 def numba_funcify_Elemwise(op, node, **kwargs):
+    nin = len(node.inputs)
+    nout = len(node.outputs)
+
     scalar_inputs = [get_scalar_type(dtype=input.dtype)() for input in node.inputs]
     scalar_node = op.scalar_op.make_node(*scalar_inputs)
-
     scalar_op_fn = numba_funcify(
         op.scalar_op,
         node=scalar_node,
         parent_node=node,
         **kwargs,
     )
 
-    nin = len(node.inputs)
-    nout = len(node.outputs)
-    core_op_fn = store_core_outputs(scalar_op_fn, nin=nin, nout=nout)
+    # TODO: Proper key
+    core_op_key = "_".join(
+        map(
+            str,
+            (
+                op,
+                op.scalar_op,
+                tuple(op.inplace_pattern.items()),
+                tuple(getattr(op.scalar_op, "props_dict", lambda: {})().items()),
+            ),
+        )
+    )
+    core_op_key = sha256(core_op_key.encode()).hexdigest()
+    core_op_fn = store_core_outputs(
+        scalar_op_fn, nin=nin, nout=nout, core_op_key=core_op_key
+    )
 
     input_bc_patterns = tuple(inp.type.broadcastable for inp in node.inputs)
     output_bc_patterns = tuple(out.type.broadcastable for out in node.outputs)
@@ -333,11 +348,31 @@ def elemwise(*inputs):
             return tuple(outputs_summed)
         return outputs_summed[0]
 
-    @overload(elemwise, jit_options=_jit_options)
+    @overload(elemwise)
     def ov_elemwise(*inputs):
         return elemwise_wrapper
 
-    return elemwise
+    # TODO: Also input dtypes in key
+    elemwise_key = "_".join(
+        map(
+            str,
+            (
+                "Elemwise",
+                core_op_key,
+                input_bc_patterns,
+                inplace_pattern,
+            ),
+        )
+    )
+    elemwise_key = sha256(elemwise_key.encode()).hexdigest()
+    f = compile_and_cache_numba_function_src(
+        "def f(*inputs): return elemwise(*inputs)",
+        "f",
+        {**globals(), **{"elemwise": elemwise}},
+        key=elemwise_key,
+    )
+
+    return numba_njit(f)
 
 
 @numba_funcify.register(Sum)
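
The keys are SHA-256 digests of a stable string built from the op's identity, so equal graphs map to equal cache files across runs; a toy version of the same construction:

    from hashlib import sha256

    # Illustrative stand-ins for core_op_key, the broadcast patterns, and
    # inplace_pattern as used above.
    parts = ("Elemwise", "3f8a...", ((False,), (False,)), ())
    key = sha256("_".join(map(str, parts)).encode()).hexdigest()
    # Deterministic: the same parts always produce the same key.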

pytensor/link/numba/dispatch/scalar.py

Lines changed: 11 additions & 7 deletions

@@ -4,6 +4,7 @@
 
 from pytensor.compile.ops import TypeCastingOp
 from pytensor.graph.basic import Variable
+from pytensor.link.numba.cache import compile_and_cache_numba_function_src
 from pytensor.link.numba.dispatch import basic as numba_basic
 from pytensor.link.numba.dispatch.basic import (
     create_numba_signature,
@@ -12,7 +13,6 @@
 )
 from pytensor.link.numba.dispatch.cython_support import wrap_cython_function
 from pytensor.link.utils import (
-    compile_function_src,
     get_name_for_object,
     unique_name_generator,
 )
@@ -128,16 +128,20 @@ def {scalar_op_fn_name}({', '.join(input_names)}):
         return direct_cast(scalar_func_numba({converted_call_args}, np.intc(1)), output_dtype)
     """
 
-    scalar_op_fn = compile_function_src(
-        scalar_op_src, scalar_op_fn_name, {**globals(), **global_env}
+    scalar_op_fn = compile_and_cache_numba_function_src(
+        scalar_op_src,
+        scalar_op_fn_name,
+        {**globals(), **global_env},
     )
 
-    signature = create_numba_signature(node, force_scalar=True)
+    # signature = create_numba_signature(node, force_scalar=True)
 
     return numba_basic.numba_njit(
-        signature,
+        # signature,
         # Functions that call a function pointer can't be cached
-        cache=False,
+        no_cfunc_wrapper=True,
+        no_cpython_wrapper=True,
+        register_jitable=False,
     )(scalar_op_fn)
 
 
@@ -164,7 +168,7 @@ def binary_to_nary_func(inputs: list[Variable], binary_op_name: str, binary_op:
 def {binary_op_name}({input_signature}):
     return {output_expr}
 """
-    nary_fn = compile_function_src(nary_src, binary_op_name, globals())
+    nary_fn = compile_and_cache_numba_function_src(nary_src, binary_op_name, globals())
 
     return nary_fn
