× Close
Skip to content

Commit d248a04

Browse files
authored
Refine specialization caches and extend binary-op coverage (#7386)
* Align BINARY_OP_EXTEND with CPython descriptor cache model
* Align type _spec_cache and latin1 singleton string paths
* Add specialization differential harness and align init error text
* Tighten CALL_ALLOC_AND_ENTER_INIT stack-space guard
* Align call-init frame flow and spec cache atomic ordering
* Refine call-init recursion guard and cache swap lifetime handling
* Align spec cache write locking with CPython contract
* Align load attr miss cooldown with CPython
* Align CALL descriptor and class-call specialization with CPython
* Extract datastack_frame_size_bytes_for_code, skip monitoring for init_cleanup frames, guard trace dispatch
  - Extract datastack_frame_size_bytes_for_code as free function, use it to compute init_cleanup stack bytes instead of hardcoded constant
  - Add monitoring_disabled_for_code to skip instrumentation for synthetic init_cleanup code object in RESUME and execute_instrumented
  - Add is_trace_event guard so profile-only events skip trace_func dispatch
1 parent a854ef2 commit d248a04

File tree

10 files changed

+795
-473
lines changed

10 files changed

+795
-473
lines changed

crates/vm/src/builtins/function.rs

Lines changed: 62 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ use crate::{
1313
bytecode,
1414
class::PyClassImpl,
1515
common::wtf8::{Wtf8Buf, wtf8_concat},
16-
frame::Frame,
16+
frame::{Frame, FrameRef},
1717
function::{FuncArgs, OptionalArg, PyComparisonValue, PySetterValue},
1818
scope::Scope,
1919
types::{
@@ -673,27 +673,14 @@ impl Py<PyFunction> {
673673
/// Returns `None` for generator/coroutine code paths that do not push a
674674
/// regular datastack-backed frame in the fast call path.
675675
pub(crate) fn datastack_frame_size_bytes(&self) -> Option<usize> {
676-
let code: &Py<PyCode> = &self.code;
677-
if code
678-
.flags
679-
.intersects(bytecode::CodeFlags::GENERATOR | bytecode::CodeFlags::COROUTINE)
680-
{
681-
return None;
682-
}
683-
let nlocalsplus = code
684-
.varnames
685-
.len()
686-
.checked_add(code.cellvars.len())?
687-
.checked_add(code.freevars.len())?;
688-
let capacity = nlocalsplus.checked_add(code.max_stackdepth as usize)?;
689-
capacity.checked_mul(core::mem::size_of::<usize>())
676+
datastack_frame_size_bytes_for_code(&self.code)
690677
}
691678

692-
/// Fast path for calling a simple function with exact positional args.
693-
/// Skips FuncArgs allocation, prepend_arg, and fill_locals_from_args.
694-
/// Only valid when: CO_OPTIMIZED, no VARARGS, no VARKEYWORDS, no kwonlyargs,
695-
/// and nargs == co_argcount.
696-
pub fn invoke_exact_args(&self, mut args: Vec<PyObjectRef>, vm: &VirtualMachine) -> PyResult {
679+
pub(crate) fn prepare_exact_args_frame(
680+
&self,
681+
mut args: Vec<PyObjectRef>,
682+
vm: &VirtualMachine,
683+
) -> FrameRef {
697684
let code: PyRef<PyCode> = (*self.code).to_owned();
698685

699686
debug_assert_eq!(args.len(), code.arg_count as usize);
@@ -704,16 +691,11 @@ impl Py<PyFunction> {
704691
.intersects(bytecode::CodeFlags::VARARGS | bytecode::CodeFlags::VARKEYWORDS)
705692
);
706693
debug_assert_eq!(code.kwonlyarg_count, 0);
707-
708-
// Generator/coroutine code objects are SIMPLE_FUNCTION in call
709-
// specialization classification, but their call path must still
710-
// go through invoke() to produce generator/coroutine objects.
711-
if code
712-
.flags
713-
.intersects(bytecode::CodeFlags::GENERATOR | bytecode::CodeFlags::COROUTINE)
714-
{
715-
return self.invoke(FuncArgs::from(args), vm);
716-
}
694+
debug_assert!(
695+
!code
696+
.flags
697+
.intersects(bytecode::CodeFlags::GENERATOR | bytecode::CodeFlags::COROUTINE)
698+
);
717699

718700
let locals = if code.flags.contains(bytecode::CodeFlags::NEWLOCALS) {
719701
None
@@ -727,20 +709,18 @@ impl Py<PyFunction> {
727709
self.builtins.clone(),
728710
self.closure.as_ref().map_or(&[], |c| c.as_slice()),
729711
Some(self.to_owned().into()),
730-
true, // Always use datastack (invoke_exact_args is never gen/coro)
712+
true, // Exact-args fast path is only used for non-gen/coro functions.
731713
vm,
732714
)
733715
.into_ref(&vm.ctx);
734716

735-
// Move args directly into fastlocals (no clone/refcount needed)
736717
{
737718
let fastlocals = unsafe { frame.fastlocals_mut() };
738719
for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..)) {
739720
*slot = Some(arg);
740721
}
741722
}
742723

743-
// Handle cell2arg
744724
if let Some(cell2arg) = code.cell2arg.as_deref() {
745725
let fastlocals = unsafe { frame.fastlocals_mut() };
746726
for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) {
@@ -749,6 +729,36 @@ impl Py<PyFunction> {
749729
}
750730
}
751731

732+
frame
733+
}
734+
735+
/// Fast path for calling a simple function with exact positional args.
736+
/// Skips FuncArgs allocation, prepend_arg, and fill_locals_from_args.
737+
/// Only valid when: CO_OPTIMIZED, no VARARGS, no VARKEYWORDS, no kwonlyargs,
738+
/// and nargs == co_argcount.
739+
pub fn invoke_exact_args(&self, args: Vec<PyObjectRef>, vm: &VirtualMachine) -> PyResult {
740+
let code: PyRef<PyCode> = (*self.code).to_owned();
741+
742+
debug_assert_eq!(args.len(), code.arg_count as usize);
743+
debug_assert!(code.flags.contains(bytecode::CodeFlags::OPTIMIZED));
744+
debug_assert!(
745+
!code
746+
.flags
747+
.intersects(bytecode::CodeFlags::VARARGS | bytecode::CodeFlags::VARKEYWORDS)
748+
);
749+
debug_assert_eq!(code.kwonlyarg_count, 0);
750+
751+
// Generator/coroutine code objects are SIMPLE_FUNCTION in call
752+
// specialization classification, but their call path must still
753+
// go through invoke() to produce generator/coroutine objects.
754+
if code
755+
.flags
756+
.intersects(bytecode::CodeFlags::GENERATOR | bytecode::CodeFlags::COROUTINE)
757+
{
758+
return self.invoke(FuncArgs::from(args), vm);
759+
}
760+
let frame = self.prepare_exact_args_frame(args, vm);
761+
752762
let result = vm.run_frame(frame.clone());
753763
unsafe {
754764
if let Some(base) = frame.materialize_localsplus() {
@@ -759,6 +769,22 @@ impl Py<PyFunction> {
759769
}
760770
}
761771

772+
pub(crate) fn datastack_frame_size_bytes_for_code(code: &Py<PyCode>) -> Option<usize> {
773+
if code
774+
.flags
775+
.intersects(bytecode::CodeFlags::GENERATOR | bytecode::CodeFlags::COROUTINE)
776+
{
777+
return None;
778+
}
779+
let nlocalsplus = code
780+
.varnames
781+
.len()
782+
.checked_add(code.cellvars.len())?
783+
.checked_add(code.freevars.len())?;
784+
let capacity = nlocalsplus.checked_add(code.max_stackdepth as usize)?;
785+
capacity.checked_mul(core::mem::size_of::<usize>())
786+
}
787+
762788
impl PyPayload for PyFunction {
763789
#[inline]
764790
fn class(ctx: &Context) -> &'static Py<PyType> {
@@ -1351,6 +1377,7 @@ pub(crate) fn vectorcall_function(
13511377

13521378
let has_kwargs = kwnames.is_some_and(|kw| !kw.is_empty());
13531379
let is_simple = !has_kwargs
1380+
&& code.flags.contains(bytecode::CodeFlags::OPTIMIZED)
13541381
&& !code.flags.contains(bytecode::CodeFlags::VARARGS)
13551382
&& !code.flags.contains(bytecode::CodeFlags::VARKEYWORDS)
13561383
&& code.kwonlyarg_count == 0
@@ -1361,37 +1388,8 @@ pub(crate) fn vectorcall_function(
13611388
if is_simple && nargs == code.arg_count as usize {
13621389
// FAST PATH: simple positional-only call, exact arg count.
13631390
// Move owned args directly into fastlocals — no clone needed.
1364-
let locals = if code.flags.contains(bytecode::CodeFlags::NEWLOCALS) {
1365-
None // lazy allocation — most frames never access locals dict
1366-
} else {
1367-
Some(ArgMapping::from_dict_exact(zelf.globals.clone()))
1368-
};
1369-
1370-
let frame = Frame::new(
1371-
code.to_owned(),
1372-
Scope::new(locals, zelf.globals.clone()),
1373-
zelf.builtins.clone(),
1374-
zelf.closure.as_ref().map_or(&[], |c| c.as_slice()),
1375-
Some(zelf.to_owned().into()),
1376-
true, // Always use datastack (is_simple excludes gen/coro)
1377-
vm,
1378-
)
1379-
.into_ref(&vm.ctx);
1380-
1381-
{
1382-
let fastlocals = unsafe { frame.fastlocals_mut() };
1383-
for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..nargs)) {
1384-
*slot = Some(arg);
1385-
}
1386-
}
1387-
1388-
if let Some(cell2arg) = code.cell2arg.as_deref() {
1389-
let fastlocals = unsafe { frame.fastlocals_mut() };
1390-
for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) {
1391-
let x = fastlocals[*arg_idx as usize].take();
1392-
frame.set_cell_contents(cell_idx, x);
1393-
}
1394-
}
1391+
args.truncate(nargs);
1392+
let frame = zelf.prepare_exact_args_frame(args, vm);
13951393

13961394
let result = vm.run_frame(frame.clone());
13971395
unsafe {

crates/vm/src/builtins/str.rs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1691,7 +1691,7 @@ impl ToPyObject for char {
16911691
fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
16921692
let cp = self as u32;
16931693
if cp <= u8::MAX as u32 {
1694-
vm.ctx.latin1_char_cache[cp as usize].clone().into()
1694+
vm.ctx.latin1_char(cp as u8).into()
16951695
} else {
16961696
vm.ctx.new_str(self).into()
16971697
}
@@ -1702,7 +1702,7 @@ impl ToPyObject for CodePoint {
17021702
fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
17031703
let cp = self.to_u32();
17041704
if cp <= u8::MAX as u32 {
1705-
vm.ctx.latin1_char_cache[cp as usize].clone().into()
1705+
vm.ctx.latin1_char(cp as u8).into()
17061706
} else {
17071707
vm.ctx.new_str(self).into()
17081708
}
@@ -1747,7 +1747,7 @@ impl ToPyObject for AsciiString {
17471747

17481748
impl ToPyObject for AsciiChar {
17491749
fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
1750-
vm.ctx.new_str(self).into()
1750+
vm.ctx.latin1_char(u8::from(self)).into()
17511751
}
17521752
}
17531753

0 commit comments

Comments
 (0)
× Close