X Close
Skip to content

Commit 9a0511b

Browse files
authored
Align specialization guards and caching with CPython (#7341)
* vm: complete specialized opcode dispatch paths
* vm: cache LOAD_GLOBAL with dict entry hints
* vm: align adaptive specialization counters with CPython backoff
* vm: apply cooldown counter on specialization success paths
* vm: retain LOAD_GLOBAL specializations on misses
* vm: keep attr and call specializations on guard misses
* vm: retain store-attr and store-subscr specializations on misses
* vm: retain specialization opcodes on generic fallback paths
* vm: align jump-backward specialization defaults with CPython
* vm: retain exact-args call specializations on misses
* vm: retain SEND_GEN specialization on non-coroutine sends
* vm: specialize list.append calls like CPython CALL_LIST_APPEND
* vm: set cooldown on LOAD_ATTR_CLASS specialization
* vm: specialize bound method object CALL paths
* vm: specialize CALL_KW for bound method objects
* vm: use current-state function version for CALL_KW specialization
* vm: align CALL/CALL_KW pyfunction specialization with CPython
* vm: drop call-site identity caches in generic CALL specializations
* vm: align builtin type call specializations with CPython guards
* vm: align builtin CALL guards with CPython self_or_null semantics
* vm: require exact list in CALL_LIST_APPEND fast path
* vm: align CALL builtin/class specialization flow with CPython
* vm: tighten len/isinstance CALL specializations to builtin guards
* vm: gate CALL_BUILTIN_CLASS on type vectorcall like CPython
* vm: run non-py CALL specializations via direct vectorcall
* vm: align class-call specialization branching with CPython
* Fix CI: disable ForIterGen, tighten CALL guards
  - Disable ForIterGen specialization (falls through to generic path) because inline generator frame resumption is needed for correct debugger StopIteration visibility (test_bdb)
  - Use downcast_ref_if_exact for PyNativeFunction in CALL specialization guards
  - Add can_specialize_call guard for class __init__ specialization
  - Remove expectedFailure for test_bad_newobj_args (now passing)
1 parent 5c29074 commit 9a0511b

File tree

9 files changed

+1214
-713
lines changed

9 files changed

+1214
-713
lines changed

Lib/test/test_pickle.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,6 @@ def dumps(self, arg, proto=None, **kwargs):
8585
f.seek(0)
8686
return bytes(f.read())
8787

88-
@unittest.expectedFailure # TODO: RUSTPYTHON
89-
def test_bad_newobj_args(self):
90-
return super().test_bad_newobj_args()
91-
9288
@unittest.expectedFailure # TODO: RUSTPYTHON
9389
def test_buffer_callback_error(self):
9490
return super().test_buffer_callback_error()

crates/compiler-core/src/bytecode.rs

Lines changed: 92 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -349,9 +349,47 @@ pub struct CodeUnit {
349349
const _: () = assert!(mem::size_of::<CodeUnit>() == 2);
350350

351351
/// Adaptive specialization: number of executions before attempting specialization.
352-
pub const ADAPTIVE_WARMUP_VALUE: u8 = 50;
353-
/// Adaptive specialization: backoff counter after de-optimization.
354-
pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250;
352+
///
353+
/// Matches CPython's `_Py_BackoffCounter` encoding.
354+
pub const ADAPTIVE_WARMUP_VALUE: u16 = adaptive_counter_bits(1, 1);
355+
/// Adaptive specialization: cooldown counter after a successful specialization.
356+
///
357+
/// Value/backoff = (52, 0), matching CPython's ADAPTIVE_COOLDOWN bits.
358+
pub const ADAPTIVE_COOLDOWN_VALUE: u16 = adaptive_counter_bits(52, 0);
359+
/// Initial JUMP_BACKWARD counter bits (value/backoff = 4095/12).
360+
pub const JUMP_BACKWARD_INITIAL_VALUE: u16 = adaptive_counter_bits(4095, 12);
361+
362+
const BACKOFF_BITS: u16 = 4;
363+
const MAX_BACKOFF: u16 = 12;
364+
const UNREACHABLE_BACKOFF: u16 = 15;
365+
366+
/// Encode an adaptive counter as `(value << 4) | backoff`.
367+
pub const fn adaptive_counter_bits(value: u16, backoff: u16) -> u16 {
368+
(value << BACKOFF_BITS) | backoff
369+
}
370+
371+
/// True when the adaptive counter should trigger specialization.
372+
#[inline]
373+
pub const fn adaptive_counter_triggers(counter: u16) -> bool {
374+
counter < UNREACHABLE_BACKOFF
375+
}
376+
377+
/// Decrement adaptive counter by one countdown step.
378+
#[inline]
379+
pub const fn advance_adaptive_counter(counter: u16) -> u16 {
380+
counter.wrapping_sub(1 << BACKOFF_BITS)
381+
}
382+
383+
/// Reset adaptive counter with exponential backoff.
384+
#[inline]
385+
pub const fn adaptive_counter_backoff(counter: u16) -> u16 {
386+
let backoff = counter & ((1 << BACKOFF_BITS) - 1);
387+
if backoff < MAX_BACKOFF {
388+
adaptive_counter_bits((1 << (backoff + 1)) - 1, backoff + 1)
389+
} else {
390+
adaptive_counter_bits((1 << MAX_BACKOFF) - 1, MAX_BACKOFF)
391+
}
392+
}
355393

356394
impl CodeUnit {
357395
pub const fn new(op: Instruction, arg: OpArgByte) -> Self {
@@ -370,12 +408,15 @@ impl TryFrom<&[u8]> for CodeUnit {
370408
}
371409
}
372410

373-
pub struct CodeUnits(UnsafeCell<Box<[CodeUnit]>>);
411+
pub struct CodeUnits {
412+
units: UnsafeCell<Box<[CodeUnit]>>,
413+
adaptive_counters: Box<[AtomicU16]>,
414+
}
374415

375416
// SAFETY: All cache operations use atomic read/write instructions.
376417
// - replace_op / compare_exchange_op: AtomicU8 store/CAS (Release)
377418
// - cache read/write: AtomicU16 load/store (Relaxed)
378-
// - adaptive counter: AtomicU8 load/store (Relaxed)
419+
// - adaptive counter: AtomicU16 load/store (Relaxed)
379420
// Ordering is established by:
380421
// - replace_op (Release) ↔ dispatch loop read_op (Acquire) for cache data visibility
381422
// - tp_version_tag (Acquire) for descriptor pointer validity
@@ -385,15 +426,23 @@ impl Clone for CodeUnits {
385426
fn clone(&self) -> Self {
386427
// SAFETY: No concurrent mutation during clone — cloning is only done
387428
// during code object construction or marshaling, not while instrumented.
388-
let inner = unsafe { &*self.0.get() };
389-
Self(UnsafeCell::new(inner.clone()))
429+
let units = unsafe { &*self.units.get() }.clone();
430+
let adaptive_counters = self
431+
.adaptive_counters
432+
.iter()
433+
.map(|c| AtomicU16::new(c.load(Ordering::Relaxed)))
434+
.collect();
435+
Self {
436+
units: UnsafeCell::new(units),
437+
adaptive_counters,
438+
}
390439
}
391440
}
392441

393442
impl fmt::Debug for CodeUnits {
394443
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
395444
// SAFETY: Debug formatting doesn't race with replace_op
396-
let inner = unsafe { &*self.0.get() };
445+
let inner = unsafe { &*self.units.get() };
397446
f.debug_tuple("CodeUnits").field(inner).finish()
398447
}
399448
}
@@ -406,29 +455,37 @@ impl TryFrom<&[u8]> for CodeUnits {
406455
return Err(Self::Error::InvalidBytecode);
407456
}
408457

409-
let units: Self = value
458+
let units = value
410459
.chunks_exact(2)
411460
.map(CodeUnit::try_from)
412-
.collect::<Result<_, _>>()?;
413-
Ok(units)
461+
.collect::<Result<Vec<_>, _>>()?;
462+
Ok(units.into())
414463
}
415464
}
416465

417466
impl<const N: usize> From<[CodeUnit; N]> for CodeUnits {
418467
fn from(value: [CodeUnit; N]) -> Self {
419-
Self(UnsafeCell::new(Box::from(value)))
468+
Self::from(Vec::from(value))
420469
}
421470
}
422471

423472
impl From<Vec<CodeUnit>> for CodeUnits {
424473
fn from(value: Vec<CodeUnit>) -> Self {
425-
Self(UnsafeCell::new(value.into_boxed_slice()))
474+
let units = value.into_boxed_slice();
475+
let adaptive_counters = (0..units.len())
476+
.map(|_| AtomicU16::new(0))
477+
.collect::<Vec<_>>()
478+
.into_boxed_slice();
479+
Self {
480+
units: UnsafeCell::new(units),
481+
adaptive_counters,
482+
}
426483
}
427484
}
428485

429486
impl FromIterator<CodeUnit> for CodeUnits {
430487
fn from_iter<T: IntoIterator<Item = CodeUnit>>(iter: T) -> Self {
431-
Self(UnsafeCell::new(iter.into_iter().collect()))
488+
Self::from(iter.into_iter().collect::<Vec<_>>())
432489
}
433490
}
434491

@@ -439,7 +496,7 @@ impl Deref for CodeUnits {
439496
// SAFETY: Shared references to the slice are valid even while replace_op
440497
// may update individual opcode bytes — readers tolerate stale opcodes
441498
// (they will re-read on the next iteration).
442-
unsafe { &*self.0.get() }
499+
unsafe { &*self.units.get() }
443500
}
444501
}
445502

@@ -452,7 +509,7 @@ impl CodeUnits {
452509
/// - `index` must be in bounds.
453510
/// - `new_op` must have the same arg semantics as the original opcode.
454511
pub unsafe fn replace_op(&self, index: usize, new_op: Instruction) {
455-
let units = unsafe { &*self.0.get() };
512+
let units = unsafe { &*self.units.get() };
456513
let ptr = units.as_ptr().wrapping_add(index) as *const AtomicU8;
457514
unsafe { &*ptr }.store(new_op.into(), Ordering::Release);
458515
}
@@ -468,7 +525,7 @@ impl CodeUnits {
468525
expected: Instruction,
469526
new_op: Instruction,
470527
) -> bool {
471-
let units = unsafe { &*self.0.get() };
528+
let units = unsafe { &*self.units.get() };
472529
let ptr = units.as_ptr().wrapping_add(index) as *const AtomicU8;
473530
unsafe { &*ptr }
474531
.compare_exchange(
@@ -483,7 +540,7 @@ impl CodeUnits {
483540
/// Atomically read the opcode at `index` with Acquire ordering.
484541
/// Pairs with `replace_op` (Release) to ensure cache data visibility.
485542
pub fn read_op(&self, index: usize) -> Instruction {
486-
let units = unsafe { &*self.0.get() };
543+
let units = unsafe { &*self.units.get() };
487544
let ptr = units.as_ptr().wrapping_add(index) as *const AtomicU8;
488545
let byte = unsafe { &*ptr }.load(Ordering::Acquire);
489546
// SAFETY: Only valid Instruction values are stored via replace_op/compare_exchange_op.
@@ -492,7 +549,7 @@ impl CodeUnits {
492549

493550
/// Atomically read the arg byte at `index` with Relaxed ordering.
494551
pub fn read_arg(&self, index: usize) -> OpArgByte {
495-
let units = unsafe { &*self.0.get() };
552+
let units = unsafe { &*self.units.get() };
496553
let ptr = units.as_ptr().wrapping_add(index) as *const u8;
497554
let arg_ptr = unsafe { ptr.add(1) } as *const AtomicU8;
498555
OpArgByte::from(unsafe { &*arg_ptr }.load(Ordering::Relaxed))
@@ -505,7 +562,7 @@ impl CodeUnits {
505562
/// # Safety
506563
/// - `index` must be in bounds and point to a CACHE entry.
507564
pub unsafe fn write_cache_u16(&self, index: usize, value: u16) {
508-
let units = unsafe { &*self.0.get() };
565+
let units = unsafe { &*self.units.get() };
509566
let ptr = units.as_ptr().wrapping_add(index) as *const AtomicU16;
510567
unsafe { &*ptr }.store(value, Ordering::Relaxed);
511568
}
@@ -516,7 +573,7 @@ impl CodeUnits {
516573
/// # Panics
517574
/// Panics if `index` is out of bounds.
518575
pub fn read_cache_u16(&self, index: usize) -> u16 {
519-
let units = unsafe { &*self.0.get() };
576+
let units = unsafe { &*self.units.get() };
520577
assert!(index < units.len(), "read_cache_u16: index out of bounds");
521578
let ptr = units.as_ptr().wrapping_add(index) as *const AtomicU16;
522579
unsafe { &*ptr }.load(Ordering::Relaxed)
@@ -564,25 +621,19 @@ impl CodeUnits {
564621
lo | (hi << 32)
565622
}
566623

567-
/// Read the adaptive counter from the CACHE entry's `arg` byte at `index`.
624+
/// Read adaptive counter bits for instruction at `index`.
568625
/// Uses Relaxed atomic load.
569-
pub fn read_adaptive_counter(&self, index: usize) -> u8 {
570-
let units = unsafe { &*self.0.get() };
571-
let ptr = units.as_ptr().wrapping_add(index) as *const u8;
572-
let arg_ptr = unsafe { ptr.add(1) } as *const AtomicU8;
573-
unsafe { &*arg_ptr }.load(Ordering::Relaxed)
626+
pub fn read_adaptive_counter(&self, index: usize) -> u16 {
627+
self.adaptive_counters[index].load(Ordering::Relaxed)
574628
}
575629

576-
/// Write the adaptive counter to the CACHE entry's `arg` byte at `index`.
630+
/// Write adaptive counter bits for instruction at `index`.
577631
/// Uses Relaxed atomic store.
578632
///
579633
/// # Safety
580-
/// - `index` must be in bounds and point to a CACHE entry.
581-
pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) {
582-
let units = unsafe { &*self.0.get() };
583-
let ptr = units.as_ptr().wrapping_add(index) as *const u8;
584-
let arg_ptr = unsafe { ptr.add(1) } as *const AtomicU8;
585-
unsafe { &*arg_ptr }.store(value, Ordering::Relaxed);
634+
/// - `index` must be in bounds.
635+
pub unsafe fn write_adaptive_counter(&self, index: usize, value: u16) {
636+
self.adaptive_counters[index].store(value, Ordering::Relaxed);
586637
}
587638

588639
/// Produce a clean copy of the bytecode suitable for serialization
@@ -611,7 +662,7 @@ impl CodeUnits {
611662

612663
/// Initialize adaptive warmup counters for all cacheable instructions.
613664
/// Called lazily at RESUME (first execution of a code object).
614-
/// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`.
665+
/// Counters are stored out-of-line to preserve `op = Instruction::Cache`.
615666
/// All writes are atomic (Relaxed) to avoid data races with concurrent readers.
616667
pub fn quicken(&self) {
617668
let len = self.len();
@@ -625,8 +676,13 @@ impl CodeUnits {
625676
if !op.is_instrumented() {
626677
let cache_base = i + 1;
627678
if cache_base < len {
679+
let initial_counter = if matches!(op, Instruction::JumpBackward { .. }) {
680+
JUMP_BACKWARD_INITIAL_VALUE
681+
} else {
682+
ADAPTIVE_WARMUP_VALUE
683+
};
628684
unsafe {
629-
self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE);
685+
self.write_adaptive_counter(cache_base, initial_counter);
630686
}
631687
}
632688
}

crates/jit/src/instructions.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,9 +210,18 @@ impl<'a, 'b> FunctionCompiler<'a, 'b> {
210210
func_ref: FuncRef,
211211
bytecode: &CodeObject<C>,
212212
) -> Result<(), JitCompileError> {
213+
// JIT should consume a stable instruction stream: de-specialized opcodes
214+
// with zeroed CACHE entries, not runtime-mutated quickened code.
215+
let clean_instructions: bytecode::CodeUnits = bytecode
216+
.instructions
217+
.original_bytes()
218+
.as_slice()
219+
.try_into()
220+
.map_err(|_| JitCompileError::BadBytecode)?;
221+
213222
let mut label_targets = BTreeSet::new();
214223
let mut target_arg_state = OpArgState::default();
215-
for (offset, &raw_instr) in bytecode.instructions.iter().enumerate() {
224+
for (offset, &raw_instr) in clean_instructions.iter().enumerate() {
216225
let (instruction, arg) = target_arg_state.get(raw_instr);
217226
if let Some(target) = Self::instruction_target(offset as u32, instruction, arg)? {
218227
label_targets.insert(target);
@@ -223,7 +232,7 @@ impl<'a, 'b> FunctionCompiler<'a, 'b> {
223232
// Track whether we have "returned" in the current block
224233
let mut in_unreachable_code = false;
225234

226-
for (offset, &raw_instr) in bytecode.instructions.iter().enumerate() {
235+
for (offset, &raw_instr) in clean_instructions.iter().enumerate() {
227236
let label = Label(offset as u32);
228237
let (instruction, arg) = arg_state.get(raw_instr);
229238

crates/vm/src/builtins/dict.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,33 @@ impl Py<PyDict> {
668668
}
669669
}
670670

671+
/// Return a cached-entry hint for exact dict fast paths.
672+
pub(crate) fn hint_for_key<K: DictKey + ?Sized>(
673+
&self,
674+
key: &K,
675+
vm: &VirtualMachine,
676+
) -> PyResult<Option<u16>> {
677+
if self.exact_dict(vm) {
678+
self.entries.hint_for_key(vm, key)
679+
} else {
680+
Ok(None)
681+
}
682+
}
683+
684+
/// Fast lookup using a cached entry index hint.
685+
pub(crate) fn get_item_opt_hint<K: DictKey + ?Sized>(
686+
&self,
687+
key: &K,
688+
hint: u16,
689+
vm: &VirtualMachine,
690+
) -> PyResult<Option<PyObjectRef>> {
691+
if self.exact_dict(vm) {
692+
self.entries.get_hint(vm, key, usize::from(hint))
693+
} else {
694+
self.get_item_opt(key, vm)
695+
}
696+
}
697+
671698
pub fn get_item<K: DictKey + ?Sized>(&self, key: &K, vm: &VirtualMachine) -> PyResult {
672699
if self.exact_dict(vm) {
673700
self.inner_getitem(key, vm)

crates/vm/src/builtins/function.rs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -727,10 +727,12 @@ impl PyFunction {
727727

728728
#[pygetset(setter)]
729729
fn set___code__(&self, code: PyRef<PyCode>, vm: &VirtualMachine) {
730+
#[cfg(feature = "jit")]
731+
let mut jit_guard = self.jitted_code.lock();
730732
self.code.swap_to_temporary_refs(code, vm);
731733
#[cfg(feature = "jit")]
732734
{
733-
*self.jitted_code.lock() = None;
735+
*jit_guard = None;
734736
}
735737
self.func_version.store(0, Relaxed);
736738
}
@@ -968,15 +970,16 @@ impl PyFunction {
968970
#[cfg(feature = "jit")]
969971
#[pymethod]
970972
fn __jit__(zelf: PyRef<Self>, vm: &VirtualMachine) -> PyResult<()> {
971-
if zelf.jitted_code.lock().is_some() {
973+
let mut jit_guard = zelf.jitted_code.lock();
974+
if jit_guard.is_some() {
972975
return Ok(());
973976
}
974977
let arg_types = jit::get_jit_arg_types(&zelf, vm)?;
975978
let ret_type = jit::jit_ret_type(&zelf, vm)?;
976979
let code: &Py<PyCode> = &zelf.code;
977980
let compiled = rustpython_jit::compile(&code.code, &arg_types, ret_type)
978981
.map_err(|err| jit::new_jit_error(err.to_string(), vm))?;
979-
*zelf.jitted_code.lock() = Some(compiled);
982+
*jit_guard = Some(compiled);
980983
Ok(())
981984
}
982985
}
@@ -1149,6 +1152,16 @@ impl PyBoundMethod {
11491152
Self { object, function }
11501153
}
11511154

1155+
#[inline]
1156+
pub(crate) fn function_obj(&self) -> &PyObjectRef {
1157+
&self.function
1158+
}
1159+
1160+
#[inline]
1161+
pub(crate) fn self_obj(&self) -> &PyObjectRef {
1162+
&self.object
1163+
}
1164+
11521165
#[deprecated(note = "Use `Self::new(object, function).into_ref(ctx)` instead")]
11531166
pub fn new_ref(object: PyObjectRef, function: PyObjectRef, ctx: &Context) -> PyRef<Self> {
11541167
Self::new(object, function).into_ref(ctx)

crates/vm/src/builtins/tuple.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,13 @@ impl PyTuple {
327327

328328
fn _getitem(&self, needle: &PyObject, vm: &VirtualMachine) -> PyResult {
329329
match SequenceIndex::try_from_borrowed_object(vm, needle, "tuple")? {
330-
SequenceIndex::Int(i) => self.elements.getitem_by_index(vm, i),
330+
SequenceIndex::Int(i) => {
331+
let index = self
332+
.elements
333+
.wrap_index(i)
334+
.ok_or_else(|| vm.new_index_error("tuple index out of range"))?;
335+
Ok(self.elements[index].clone())
336+
}
331337
SequenceIndex::Slice(slice) => self
332338
.elements
333339
.getitem_by_slice(vm, slice)

0 commit comments

Comments
 (0)
X Close