""" open/DurusWorks/durus/connection.py """ from durus.error import ConflictError, WriteConflictError, ReadConflictError from durus.error import DurusKeyError from durus.logger import log from durus.persistent import ConnectionBase, GHOST from durus.persistent_dict import PersistentDict from durus.serialize import ObjectReader, ObjectWriter from durus.serialize import unpack_record, pack_record from durus.utils import int8_to_str, iteritems, loads, byte_string, as_bytes from heapq import heappush, heappop from itertools import islice, chain from os import getpid from time import time from weakref import ref, KeyedRef import durus.storage ROOT_DURUS_ID = int8_to_str(0) class Connection (ConnectionBase): """ The Connection manages movement of objects in and out of storage. Instance attributes: storage: Storage cache: Cache reader: ObjectReader changed: {durus_id:str : PersistentObject} invalid_durus_ids: set([str]) Set of durus_ids of objects known to have obsolete state. transaction_serial: int Number of calls to commit() or abort() since this instance was created. This is used to maintain consistency, and to implement LRU replacement in the cache. """ def __init__(self, storage, cache_size=100000, root_class=None): """(storage:Storage|str, cache_size:int=100000, root_class:class|None=None) Make a connection to `storage`. Set the target number of non-ghosted persistent objects to keep in the cache at `cache_size`. If there is no root object yet, create it as an instance of the root_class (or PersistentDict, if root_class is None), calling the constructor with no arguments. Also, if the root_class is not None, verify that this really is the class of the root object. """ if isinstance(storage, str): from durus.file_storage import FileStorage storage = FileStorage(storage) assert isinstance(storage, durus.storage.Storage) self.storage = storage self.reader = ObjectReader(self) self.changed = {} self.invalid_durus_ids = set() self.new_durus_id = storage.new_durus_id # needed by serialize self.cache = Cache(cache_size) self.root = self.get(ROOT_DURUS_ID) if self.root is None: new_durus_id = self.new_durus_id() assert ROOT_DURUS_ID == new_durus_id self.root = self.get_cache().get_instance( ROOT_DURUS_ID, root_class or PersistentDict, self) self.root._p_set_status_saved() self.root.__class__.__init__(self.root) self.root._p_note_change() self.commit() assert root_class in (None, self.root.__class__) def get_storage(self): """() -> Storage""" return self.storage def get_cache_count(self): """() -> int Return the number of PersistentObject instances currently in the cache. """ return self.cache.get_count() def get_cache_size(self): """() -> cache_size:int Return the target size for the cache. """ return self.cache.get_size() def set_cache_size(self, size): """(size:int) Set the target size for the cache. """ self.cache.set_size(size) def get_transaction_serial(self): """() -> int Return the number of calls to commit() or abort() on this instance. """ return self.transaction_serial def get_root(self): """() -> PersistentObject Returns the root object. """ return self.root def get_stored_pickle(self, durus_id): """(durus_id:str) -> str Retrieve the pickle from storage. Will raise ReadConflictError if the durus_id is invalid. 
""" assert durus_id not in self.invalid_durus_ids, "still conflicted: missing abort()" try: record = self.storage.load(durus_id) except ReadConflictError: invalid_durus_ids = self.storage.sync() self._handle_invalidations(invalid_durus_ids, read_durus_id=durus_id) record = self.storage.load(durus_id) durus_id2, data, refdata = unpack_record(record) assert as_bytes(durus_id) == durus_id2, (durus_id, durus_id2) return data def get(self, durus_id): """(durus_id:str|int|long) -> PersistentObject | None Return object for `durus_id`. The object may be a ghost. """ if not isinstance(durus_id, byte_string): durus_id = int8_to_str(durus_id) obj = self.cache.get(durus_id) if obj is not None: return obj try: data = self.get_stored_pickle(durus_id) except KeyError: return None klass = loads(data) obj = self.cache.get_instance(durus_id, klass, self) state = self.reader.get_state(data, load=True) obj.__setstate__(state) obj._p_set_status_saved() return obj __getitem__ = get def get_crawler(self, start_durus_id=ROOT_DURUS_ID, batch_size=100): """(start_durus_id:str = ROOT_DURUS_ID, batch_size:int = 100) -> sequence(PersistentObject) Returns a generator for the sequence of objects in a breadth first traversal of the object graph, starting at the given start_durus_id. The objects in the sequence have their state loaded at the same time, so this can be used to initialize the object cache. This uses the storage's bulk_load() method to make it faster. The batch_size argument sets the number of object records loaded on each call to bulk_load(). """ durus_id_record_sequence = self.storage.gen_durus_id_record( start_durus_id=start_durus_id, batch_size=batch_size) for durus_id, record in durus_id_record_sequence: obj = self.cache.get(durus_id) if obj is not None and not obj._p_is_ghost(): yield obj else: record_durus_id, data, refdata = unpack_record(record) if obj is None: klass = loads(data) obj = self.cache.get_instance(durus_id, klass, self) state = self.reader.get_state(data, load=True) obj.__setstate__(state) obj._p_set_status_saved() yield obj def get_cache(self): return self.cache def load_state(self, obj): """(obj:PersistentObject) Load the state for the given ghost object. """ assert self.storage is not None, 'connection is closed' assert obj._p_is_ghost() durus_id = obj._p_durus_id try: pickle = self.get_stored_pickle(durus_id) except DurusKeyError: # We have a ghost but cannot find the state for it. This can # happen if the object was removed from the storage as a result # of packing. raise ReadConflictError([durus_id]) state = self.reader.get_state(pickle) obj.__setstate__(state) obj._p_set_status_saved() def get_load_count(self): """() -> int Returns the number of times that any object's state has been loaded. """ return self.reader.get_load_count() def note_access(self, obj): assert obj._p_connection is self assert obj._p_durus_id is not None obj._p_serial = self.transaction_serial self.cache.recent_objects.add(obj) def note_change(self, obj): """(obj:PersistentObject) This is done when any persistent object is changed. Changed objects will be stored when the transaction is committed or rolled back, i.e. made into ghosts, on abort. """ # assert obj._p_connection is self self.changed[obj._p_durus_id] = obj def shrink_cache(self): """ If the number of saved and unsaved objects is more than twice the target cache size (and the target cache size is positive), try to ghostify enough of the saved objects to achieve the target cache size. 
""" self.cache.shrink(self) def _sync(self): """ Process all invalid_durus_ids so that all non-ghost objects are current. """ invalid_durus_ids = self.storage.sync() self.invalid_durus_ids.update(invalid_durus_ids) for durus_id in self.invalid_durus_ids: obj = self.cache.get(durus_id) if obj is not None: obj._p_set_status_ghost() self.invalid_durus_ids.clear() def abort(self): """ Abort uncommitted changes, sync, and try to shrink the cache. """ for durus_id, obj in iteritems(self.changed): obj._p_set_status_ghost() self.changed.clear() self._sync() self.shrink_cache() self.transaction_serial += 1 def commit(self): """ If there are any changes, try to store them, and raise WriteConflictError if there are any invalid durus_ids saved or if there are any invalid durus_ids for non-ghost objects. """ if not self.changed: self._sync() else: assert not self.invalid_durus_ids, "still conflicted: missing abort()" self.storage.begin() new_objects = {} for durus_id, changed_object in iteritems(self.changed): writer = ObjectWriter(self) try: for obj in writer.gen_new_objects(changed_object): durus_id = obj._p_durus_id if durus_id in new_objects: continue elif durus_id not in self.changed: new_objects[durus_id] = obj self.cache[durus_id] = obj data, refs = writer.get_state(obj) self.storage.store(durus_id, pack_record(durus_id, data, refs)) obj._p_set_status_saved() finally: writer.close() try: self.storage.end(self._handle_invalidations) except ConflictError: for durus_id, obj in iteritems(new_objects): obj._p_durus_id = None del self.cache[durus_id] obj._p_set_status_unsaved() obj._p_connection = None raise self.changed.clear() self.shrink_cache() self.transaction_serial += 1 def commit_call(self, handler, max_retries=10, log=None): """(handler: callable, max_retries=10, log=None) Call handler, a callable with no arguments, and commit the results. If there is a conflict, abort and retry, up to max_retries times. This call returns whatever the call to handler returns. If the commit is not completed, this function raises the last ConflictError. The log function, if provided, is called for all ConflictErrors. """ assert max_retries >= 0 for j in range(max_retries+1): try: result = handler() self.commit() return result except ConflictError: if log: log() if j == max_retries: raise self.abort() else: raise RuntimeError("Too many retries.") def _handle_invalidations(self, durus_ids, read_durus_id=None): """(durus_ids:[str], read_durus_id:str=None) Check if any of the durus_ids are for objects that were accessed during this transaction. If so, raise the appropriate conflict exception. """ conflicts = [] for durus_id in durus_ids: obj = self.cache.get(durus_id) if obj is None: continue if obj._p_serial == self.transaction_serial: conflicts.append(durus_id) self.invalid_durus_ids.add(durus_id) elif not obj._p_is_ghost(): assert durus_id not in self.changed obj._p_set_status_ghost() if conflicts: if read_durus_id is None: raise WriteConflictError(conflicts) else: raise ReadConflictError([read_durus_id]) def pack(self): """Clear any uncommited changes and pack the storage.""" self.abort() self.storage.pack() class ObjectDictionary (object): """ Like a WeakValueDictionary, except that the actual removal of keys is delayed until the next time an iteration is started, when it is assumed that other threads are not continuing any iterations. 
""" def __init__(self): self.mapping = {} self.dead = set() def callback(keyed_ref, selfref=ref(self)): self = selfref() if self is not None: self.dead.add(keyed_ref.key) self.callback = callback def get(self, key, default=None): ref = self.mapping.get(key, None) if ref is not None: value = ref() if value is not None and key not in self.dead: return value return default def __setitem__(self, key, value): self.dead.discard(key) self.mapping[key] = KeyedRef(value, self.callback, key) def __delitem__(self, key): self.dead.add(key) def __contains__(self, key): return self.get(key, None) is not None def __len__(self): return len(self.mapping) - len(self.dead) def clear_dead(self): while self.dead: self.mapping.pop(self.dead.pop(), None) def __iter__(self): self.clear_dead() for key in self.mapping: if key not in self.dead: yield key class ReferenceContainer (object): """ This is used to hold hard references to recently used instances. """ def __init__(self): self.map = {} def __len__(self): return len(self.map) def add(self, x): self.map[id(x)] = x def discard(self, x): key = id(x) if key in self.map: del self.map[key] class Cache (object): def __init__(self, size): self.objects = ObjectDictionary() self.recent_objects = ReferenceContainer() self.set_size(size) self.finger = 0 def get_size(self): """Return the target size of the cache.""" return self.size def get_count(self): """Return the number of objects currently in the cache.""" return len(self.objects) def set_size(self, size): if size <= 0: raise ValueError('cache target size must be > 0') self.size = size def get_instance(self, durus_id, klass, connection): """ This returns the existing object with the given durus_id, or else it makes a new one with the given class and connection. This method is called when unpickling a reference, which may happen at a high frequency, so it needs to be fast. For the sake of speed, it inlines some statements that would normally be executed through calling other functions. """ # if self.get(durus_id) is not None: return self.get(durus_id) objects = self.objects obj = objects.get(durus_id) if obj is None or obj.__class__ is not klass: # Make a new ghost. obj = klass.__new__(klass) obj._p_durus_id = durus_id obj._p_connection = connection obj._p_status = GHOST # obj._p_set_status_ghost() objects[durus_id] = obj return obj def get(self, durus_id): return self.objects.get(durus_id) def __setitem__(self, key, obj): self.objects[key] = obj def __delitem__(self, key): obj = self.objects.get(key) if obj is not None: self.recent_objects.discard(obj) assert obj._p_durus_id is None del self.objects[key] def _build_heap(self, transaction_serial): """(transaction_serial:int) -> [(serial, durus_id)] """ all = self.objects heap_size_target = (len(all) - self.size) * 2 start = self.finger % len(all) heap = [] for durus_id in islice(chain(all, all), start, start + len(all)): self.finger += 1 obj = all.get(durus_id) if obj is None: continue # The ref is dead. if obj._p_serial == transaction_serial: continue # obj is current. Leave it alone. heappush(heap, (obj._p_serial, durus_id)) if len(heap) >= heap_size_target: break self.finger = self.finger % len(all) return heap def shrink(self, connection): """(connection:Connection) Try to reduce the size of self.objects. """ current = len(self.objects) if current <= self.size: # No excess. 
            log(10, '[%s] cache size %s recent %s',
                getpid(), current, len(self.recent_objects))
            return
        start_time = time()
        heap = self._build_heap(connection.get_transaction_serial())
        num_ghosted = 0
        while heap and len(self.objects) > self.size:
            serial, durus_id = heappop(heap)
            obj = self.objects.get(durus_id)
            if obj is None:
                continue
            if obj._p_is_saved():
                obj._p_set_status_ghost()
                num_ghosted += 1
            self.recent_objects.discard(obj)
        log(10, '[%s] shrink %fs removed %s ghosted %s size %s recent %s',
            getpid(), time() - start_time, current - len(self.objects),
            num_ghosted, len(self.objects), len(self.recent_objects))

    def __iter__(self):
        get = self.objects.get
        for key in self.objects:
            yield get(key)


def touch_every_reference(connection, *words):
    """(connection:Connection, *words:(str))
    Mark as changed, every object whose pickled class/state contains any
    of the given words.  This is useful when you move or rename a class,
    so that all references can be updated.
    """
    get = connection.get
    reader = ObjectReader(connection)
    words = [as_bytes(w) for w in words]
    for durus_id, record in connection.get_storage().gen_durus_id_record():
        record_durus_id, data, refs = unpack_record(record)
        state = reader.get_state_pickle(data)
        for word in words:
            if word in data or word in state:
                get(durus_id)._p_note_change()


def gen_every_instance(connection, *classes):
    """(connection:Connection, *classes:(class)) -> sequence [PersistentObject]
    Generate all PersistentObject instances that are instances of any of the
    given classes."""
    for durus_id, record in connection.get_storage().gen_durus_id_record():
        record_durus_id, state, refs = unpack_record(record)
        record_class = loads(state)
        if issubclass(record_class, classes):
            yield connection.get(durus_id)
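

# The block below is an added illustrative sketch, not part of the original
# Durus module.  It assumes a writable FileStorage path "example.durus"
# (created on first use) and shows the typical open / modify / commit cycle,
# commit_call() for automatic retry on ConflictError, and get_crawler() for
# warming the cache.
if __name__ == '__main__':
    # A str argument is wrapped in a FileStorage by Connection.__init__().
    connection = Connection("example.durus")
    root = connection.get_root()  # a PersistentDict by default
    root['counter'] = root.get('counter', 0) + 1
    connection.commit()  # store the change; conflicts raise WriteConflictError

    def increment():
        # commit_call() runs this handler and then commits, aborting and
        # retrying (up to max_retries times) if a ConflictError occurs.
        root['counter'] = root.get('counter', 0) + 1

    connection.commit_call(increment)

    # Walk the object graph breadth-first from the root, loading state in
    # batches; useful for pre-populating the cache.
    for obj in connection.get_crawler():
        pass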