Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
729654c
Add new argument to `gil_safe_call_once_and_store::call_once_and_stor…
XuehaiPan Dec 13, 2025
d2b7605
Add per-interpreter storage for `gil_safe_call_once_and_store`
XuehaiPan Dec 13, 2025
e741760
Make `~gil_safe_call_once_and_store` a no-op
XuehaiPan Dec 14, 2025
5d1d678
Fix C++11 compatibility
XuehaiPan Dec 14, 2025
0bac82d
Improve thread-safety and add default finalizer
XuehaiPan Dec 14, 2025
2a4b118
Merge branch 'master' into subinterp-call-once-and-store
XuehaiPan Dec 14, 2025
be97110
Try fix thread-safety
XuehaiPan Dec 14, 2025
3e77ce9
Try fix thread-safety
XuehaiPan Dec 14, 2025
d5b8813
Add a warning comment
XuehaiPan Dec 15, 2025
f6d0f88
Simplify `PYBIND11_INTERNALS_VERSION >= 12`
XuehaiPan Dec 15, 2025
7d8339e
Try fix thread-safety
XuehaiPan Dec 15, 2025
1920f43
Try fix thread-safety
XuehaiPan Dec 15, 2025
900bed6
Merge branch 'master' into subinterp-call-once-and-store
XuehaiPan Dec 15, 2025
a6754ba
Revert get_pp()
XuehaiPan Dec 16, 2025
1aed3ab
Update comments
XuehaiPan Dec 16, 2025
b61e902
Move call-once storage out of internals
XuehaiPan Dec 17, 2025
b72cd41
Revert internal version bump
XuehaiPan Dec 17, 2025
ac02a32
Cleanup outdated comments
XuehaiPan Dec 17, 2025
ddb6dd4
Move atomic_bool alias into pybind11::detail namespace
rwgk Dec 20, 2025
3fb52df
Add explicit #include <unordered_map> for subinterpreter support
rwgk Dec 20, 2025
32deca4
Remove extraneous semicolon after destructor definition
rwgk Dec 20, 2025
a4d4d73
Add comment explaining unused finalize parameter
rwgk Dec 20, 2025
7cb30ce
Add comment explaining error_scope usage
rwgk Dec 20, 2025
7d34139
Improve exception safety in get_or_create_call_once_storage_map()
rwgk Dec 20, 2025
78e3945
Add timeout-minutes: 3 to cpptest workflow steps
rwgk Dec 20, 2025
1014ee4
Add progress reporter for test_with_catch Catch2 runner
rwgk Dec 20, 2025
21d0dc5
clang-format auto-fix (overlooked before)
rwgk Dec 20, 2025
e1b1b1b
Disable "Move Subinterpreter" test on free-threaded Python 3.14+
rwgk Dec 21, 2025
89cae6d
style: pre-commit fixes
pre-commit-ci[bot] Dec 21, 2025
a090637
Add test for gil_safe_call_once_and_store per-interpreter isolation
rwgk Dec 21, 2025
cb5e7d7
Add STARTING/DONE timestamps to test_with_catch output
rwgk Dec 21, 2025
0f8f32a
Disable stdout buffering in test_with_catch
rwgk Dec 21, 2025
a3abdee
EXPERIMENT: Re-enable hanging test to verify CI log buffering fix
rwgk Dec 21, 2025
d6f2a7f
Revert "Disable stdout buffering in test_with_catch"
rwgk Dec 21, 2025
9b70460
Use USES_TERMINAL for cpptest to show output immediately
rwgk Dec 21, 2025
8951004
Fix clang-tidy performance-avoid-endl warning
rwgk Dec 21, 2025
c4cbe73
Add SIGTERM handler to show when test is killed by timeout
rwgk Dec 21, 2025
f330a79
Fix typo: atleast -> at_least
rwgk Dec 21, 2025
6c1ccb9
Fix GCC warn_unused_result error for write() in signal handler
rwgk Dec 21, 2025
3c01ff3
Add USES_TERMINAL to other C++ test targets
rwgk Dec 21, 2025
9e9843d
Revert "EXPERIMENT: Re-enable hanging test to verify CI log buffering…
rwgk Dec 21, 2025
e7c2648
Update comment to reference PR #5940 for Move Subinterpreter fix
rwgk Dec 21, 2025
58c08ac
Add alias `interpid_t = std::int64_t`
XuehaiPan Dec 21, 2025
305a293
Add isolation and gc test for `gil_safe_call_once_and_store`
XuehaiPan Dec 21, 2025
f6bba0f
Add thread local cache for gil_safe_call_once_and_store
XuehaiPan Dec 21, 2025
66e4697
Revert "Add thread local cache for gil_safe_call_once_and_store"
XuehaiPan Dec 21, 2025
d0819cc
Revert changes according to code review
XuehaiPan Dec 21, 2025
5ce00e5
Relocate multiple-interpreters tests
XuehaiPan Dec 21, 2025
97b50fe
Add more tests for multiple interpreters
XuehaiPan Dec 21, 2025
8819ec4
Remove copy constructor
XuehaiPan Dec 21, 2025
e84e9c1
Merge remote-tracking branch 'upstream/master' into subinterp-call-on…
XuehaiPan Dec 22, 2025
d9daef5
Apply suggestions from code review
XuehaiPan Dec 22, 2025
9a3328b
Refactor to use per-storage capsule instead
XuehaiPan Dec 22, 2025
b68faf0
Merge remote-tracking branch 'upstream/master' into subinterp-call-on…
XuehaiPan Dec 22, 2025
bc20601
Update comments
XuehaiPan Dec 22, 2025
b39c049
Update singleton tests
XuehaiPan Dec 22, 2025
9ef71ec
Use interpreter id type for `get_num_interpreters_seen()`
XuehaiPan Dec 22, 2025
98370f2
Suppress unused variable warning
XuehaiPan Dec 22, 2025
534235e
HACKING
XuehaiPan Dec 22, 2025
d038714
Revert "HACKING"
XuehaiPan Dec 22, 2025
3a2c34a
Try fix concurrency
XuehaiPan Dec 22, 2025
99a095d
Test even harder
XuehaiPan Dec 22, 2025
7daecd7
Reorg code to avoid duplicates
XuehaiPan Dec 23, 2025
cd950dc
Fix unique_ptr::reset -> unique_ptr::release
XuehaiPan Dec 23, 2025
db22bb4
Merge remote-tracking branch 'upstream/master' into subinterp-call-on…
XuehaiPan Dec 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions include/pybind11/detail/class.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,14 +226,12 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) {
local_internals.registered_types_cpp.erase(tinfo->cpptype);
} else {
internals.registered_types_cpp.erase(tindex);
#if PYBIND11_INTERNALS_VERSION >= 12
internals.registered_types_cpp_fast.erase(tinfo->cpptype);
for (const std::type_info *alias : tinfo->alias_chain) {
auto num_erased = internals.registered_types_cpp_fast.erase(alias);
(void) num_erased;
assert(num_erased > 0);
}
#endif
}
internals.registered_types_py.erase(tinfo->type);

Expand Down
87 changes: 73 additions & 14 deletions include/pybind11/detail/internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@
/// further ABI-incompatible changes may be made before the ABI is officially
/// changed to the new version.
#ifndef PYBIND11_INTERNALS_VERSION
# define PYBIND11_INTERNALS_VERSION 11
# define PYBIND11_INTERNALS_VERSION 12
#endif

#if PYBIND11_INTERNALS_VERSION < 11
# error "PYBIND11_INTERNALS_VERSION 11 is the minimum for all platforms for pybind11v3."
#if PYBIND11_INTERNALS_VERSION < 12
# error "PYBIND11_INTERNALS_VERSION 12 is the minimum for all platforms for pybind11v3."
#endif

PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
Expand Down Expand Up @@ -234,6 +234,38 @@ inline uint64_t round_up_to_next_pow2(uint64_t x) {

class loader_life_support;

/// Type-erased base class of `call_once_storage<T>`.
///
/// The virtual destructor allows a `call_once_storage_base *` to be deleted without knowing
/// the concrete `T`. Instances can be neither copied nor moved.
struct call_once_storage_base {
    call_once_storage_base() = default;
    virtual ~call_once_storage_base() = default;
    call_once_storage_base(const call_once_storage_base &) = delete;
    call_once_storage_base(call_once_storage_base &&) = delete;
    call_once_storage_base &operator=(const call_once_storage_base &) = delete;
    call_once_storage_base &operator=(call_once_storage_base &&) = delete;
};

/// Holds the raw bytes for a single `T` together with the bookkeeping needed to tear it down.
///
/// This struct never constructs the `T` itself; whoever fills `storage` is expected to set
/// `is_initialized` afterwards. On destruction, if `is_initialized` is true:
///   - when `finalize` is set, `finalize` is called with the stored value and the destructor
///     of `T` is NOT invoked by this struct;
///   - otherwise the destructor of `T` is invoked on the stored value.
/// If `is_initialized` is false the buffer is left untouched.
template <typename T>
struct call_once_storage : call_once_storage_base {
    // Raw buffer with the size and alignment of `T`; reinterpreted as `T` on teardown.
    alignas(T) char storage[sizeof(T)] = {};
    // NOTE(review): presumably guards the one-time initialization done by the owner of this
    // storage; it is not used inside this struct.
    std::once_flag once_flag;
    // Optional teardown hook; when non-null it replaces the plain `~T()` call.
    void (*finalize)(T &) = nullptr;
    // True once `storage` holds a live `T`.
    std::atomic_bool is_initialized{false};

    call_once_storage() = default;
    ~call_once_storage() override {
        if (is_initialized) {
            if (finalize != nullptr) {
                finalize(*reinterpret_cast<T *>(storage));
            } else {
                reinterpret_cast<T *>(storage)->~T();
            }
        }
    }
    call_once_storage(const call_once_storage &) = delete;
    call_once_storage(call_once_storage &&) = delete;
    call_once_storage &operator=(const call_once_storage &) = delete;
    call_once_storage &operator=(call_once_storage &&) = delete;
};

/// Internal data structure used to track registered instances and types.
/// Whenever binary incompatible changes are made to this structure,
/// `PYBIND11_INTERNALS_VERSION` must be incremented.
Expand All @@ -242,14 +274,12 @@ struct internals {
pymutex mutex;
pymutex exception_translator_mutex;
#endif
#if PYBIND11_INTERNALS_VERSION >= 12
// non-normative but fast "hint" for registered_types_cpp. Meant
// to be used as the first level of a two-level lookup: successful
// lookups are correct, but unsuccessful lookups need to try
// registered_types_cpp and then backfill this map if they find
// anything.
fast_type_map<type_info *> registered_types_cpp_fast;
#endif

// std::type_index -> pybind11's type information
type_map<type_info *> registered_types_cpp;
Expand All @@ -275,14 +305,13 @@ struct internals {
PyObject *instance_base = nullptr;
// Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined:
thread_specific_storage<PyThreadState> tstate;
#if PYBIND11_INTERNALS_VERSION <= 11
thread_specific_storage<loader_life_support> loader_life_support_tls; // OBSOLETE (PR #5830)
#endif
// Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined:
PyInterpreterState *istate = nullptr;

type_map<PyObject *> native_enum_type_map;

std::unordered_map<const void *, call_once_storage_base *> call_once_storage_map;

internals()
: static_property_type(make_static_property_type()),
default_metaclass(make_default_metaclass()) {
Expand All @@ -308,7 +337,12 @@ struct internals {
internals(internals &&other) = delete;
internals &operator=(const internals &other) = delete;
internals &operator=(internals &&other) = delete;
~internals() = default;
~internals() {
    // The map stores raw `call_once_storage_base *` values; delete each one through its
    // virtual destructor, then drop the now-dangling entries.
    for (auto it = call_once_storage_map.begin(); it != call_once_storage_map.end(); ++it) {
        delete it->second;
    }
    call_once_storage_map.clear();
}
};

// the internals struct (above) is shared between all the modules. local_internals are only
Expand Down Expand Up @@ -358,7 +392,6 @@ struct type_info {
void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
holder_enum_t holder_enum_v = holder_enum_t::undefined;

#if PYBIND11_INTERNALS_VERSION >= 12
// When a type appears in multiple DSOs,
// internals::registered_types_cpp_fast will have multiple distinct
// keys (the std::type_info from each DSO) mapped to the same
Expand All @@ -369,7 +402,6 @@ struct type_info {
// nb_alias_chain` added in
// https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2
std::forward_list<const std::type_info *> alias_chain;
#endif

/* A simple type never occurs as a (direct or indirect) parent
* of a class that makes use of multiple inheritance.
Expand Down Expand Up @@ -564,6 +596,15 @@ class internals_pp_manager {
/// acquire the GIL. Will never return nullptr.
std::unique_ptr<InternalsType> *get_pp() {
#ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT
// FIXME: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for
// the multi-interpreter case. The singleton may be initialized by a subinterpreter not the
// main interpreter.
//
// With multi-interpreter support, subinterpreters can be initialized concurrently, and
// the first call to this function may not come from the main interpreter.
// For example, a clean main interpreter that does not import any pybind11 module and then
// spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a
// pybind11 module concurrently.
if (get_num_interpreters_seen() > 1) {
Copy link
Contributor Author

@XuehaiPan XuehaiPan Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @b-pass @rwgk for thoughts about this, see the code comment referenced for more details.

I test the patch in this PR in:

Most things work fine except test_import_in_subinterpreter_before_main:

run_in_subprocess(
    """
    with contextlib.closing(interpreters.create()) as subinterpreter:
        subinterpreter.exec('import optree')

    import optree
    """
)

If I remove the get_num_interpreters_seen() > 1 condition, my import test works, but the cpptest in pybind11 CI breaks because internals_singleton_pp_ is never initialized. For instance, the memory address of get_internals() should be a shared constant for different pybind11 modules in a single program.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_num_interpreters_seen() > 1 check is meant to keep the "there are no subinterpreters" case (the most common case) as fast as possible.

Can you help me understand the problem that concurrent subinterpreter imports are causing?

I'm not understanding what exactly the problem is... doing concurrent imports of pybind11 modules each in its own subinterpreter should still result in each subinterpreter getting its own internals (as it should). The one that gets to this code first would populate the internals_singleton_pp_, but this does not have to be the main interpreter, and the next time get_pp is called from that first subinterpreter it will take the other (> 1) code path. Since get_num_interpreters_seen() never decreases (it is "seen" not "currently alive"), once it goes up to 2 the inner code path will be used and what is stored in the internals_singleton_pp_ doesn't matter for the rest of the program.

What's the purpose of get_pp_for_main_interpreter() and the associated once flag?

Copy link
Contributor Author

@XuehaiPan XuehaiPan Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me elaborate more about my failed test. I have a C++ singleton instance using gil_safe_call_once_and_store to ensure exactly one instance per-interpreter. With the patch in this PR, the call-once result is stored in the internals (*get_pp()), where the internals is stored in the interpreter's state dict.

MyClass &get_singleton() {
    PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store<MyClass> storage;
    return storage
        .call_once_and_store_result([]() -> MyClass {
            MyClass instance{};

            {
                // initialize
                ...
            }
            return instance;
        })
        .get_stored();
}
  • OK - test_import_in_subinterpreters_concurrently: import the module in multiple subinterpreters concurrently, without any import for a pybind11 module in the main interpreter.
  • OK - test_import_in_subinterpreter_after_main: import the module in the main interpreter, then import in a subinterpreter.
  • FAIL - test_import_in_subinterpreter_before_main: import the module in a subinterpreter (the main interpreter does not import any pybind11 module yet), then import the module in the main interpreter.

    The import succeeds in the subinterpreter, but the instance is ill-initialized in the main interpreter. If I remove the get_num_interpreters_seen() > 1 check entirely or initialize the atomic counter for seen interpreters with 2 instead of 0, my test passes.


What's the purpose of get_pp_for_main_interpreter() and the associated once flag?

Just to ensure the singleton is initialized once. I'm not sure if the scoped GIL is still working when there are multiple interpreters running. I can revert the call_once approach back to the scoped GIL approach.

// Whenever the interpreter changes on the current thread we need to invalidate the
// internals_pp so that it can be pulled from the interpreter's state dict. That is
Expand All @@ -580,15 +621,29 @@ class internals_pp_manager {
return internals_p_tls();
}
#endif
if (!internals_singleton_pp_) {
gil_scoped_acquire_simple gil;
internals_singleton_pp_ = get_or_create_pp_in_state_dict();
return get_pp_for_main_interpreter();
}

/// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already
/// exist. May acquire the GIL. Will never return nullptr.
std::unique_ptr<InternalsType> *get_pp_for_main_interpreter() {
if (!seen_main_interpreter_) {
// The first call to this function **MUST** be from the main interpreter.
// Here we **ASSUME** that the current thread is running in the main interpreter.
// The caller is responsible for ensuring this.
std::call_once(seen_main_interpreter_flag_, [&] {
gil_scoped_acquire_simple gil;
internals_singleton_pp_ = get_or_create_pp_in_state_dict();
seen_main_interpreter_ = true;
});
}
// This is shared between all threads and all interpreters.
return internals_singleton_pp_;
}

/// Drop all the references we're currently holding.
void unref() {
// See comment in get_pp() above.
#ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT
if (get_num_interpreters_seen() > 1) {
last_istate_tls() = nullptr;
Expand All @@ -600,6 +655,7 @@ class internals_pp_manager {
}

void destroy() {
// See comment in get_pp() above.
#ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT
if (get_num_interpreters_seen() > 1) {
auto *tstate = get_thread_state_unchecked();
Expand Down Expand Up @@ -660,7 +716,10 @@ class internals_pp_manager {

char const *holder_id_ = nullptr;
on_fetch_function *on_fetch_ = nullptr;
// Pointer to the singleton internals for the main interpreter
std::unique_ptr<InternalsType> *internals_singleton_pp_;
std::once_flag seen_main_interpreter_flag_;
std::atomic_bool seen_main_interpreter_{false};
};

// If We loaded the internals through `state_dict`, our `error_already_set`
Expand Down
10 changes: 2 additions & 8 deletions include/pybind11/detail/type_caster_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,32 +227,26 @@ inline detail::type_info *get_global_type_info_lock_held(const std::type_info &t
// next time.
detail::type_info *type_info = nullptr;
auto &internals = get_internals();
#if PYBIND11_INTERNALS_VERSION >= 12
auto &fast_types = internals.registered_types_cpp_fast;
#endif
auto &types = internals.registered_types_cpp;
#if PYBIND11_INTERNALS_VERSION >= 12
auto fast_it = fast_types.find(&tp);
if (fast_it != fast_types.end()) {
# ifndef NDEBUG
#ifndef NDEBUG
auto types_it = types.find(std::type_index(tp));
assert(types_it != types.end());
assert(types_it->second == fast_it->second);
# endif
#endif
return fast_it->second;
}
#endif // PYBIND11_INTERNALS_VERSION >= 12

auto it = types.find(std::type_index(tp));
if (it != types.end()) {
#if PYBIND11_INTERNALS_VERSION >= 12
// We found the type in the slow map but not the fast one, so
// some other DSO added it (otherwise it would be in the fast
// map under &tp) and therefore we must be an alias. Record
// that.
it->second->alias_chain.push_front(&tp);
fast_types.emplace(&tp, it->second);
#endif
type_info = it->second;
}
return type_info;
Expand Down
Loading
Loading