diff --git a/mercurial/cext/revlog.c b/mercurial/cext/revlog.c
--- a/mercurial/cext/revlog.c
+++ b/mercurial/cext/revlog.c
@@ -2837,6 +2837,127 @@
 	return NULL;
 }
 
+typedef struct {
+	PyObject_HEAD
+	indexObject *idx;
+	int with_buffer;
+	Py_buffer buf;
+	uint32_t m, n, seed;
+} phashObject;
+
+static int phash_init(phashObject *self, PyObject *args)
+{
+	if (!PyArg_ParseTuple(args, "y*O!", &self->buf, &HgRevlogIndex_Type, &self->idx))
+		return -1;
+	self->with_buffer = 1;
+	if (!PyBuffer_IsContiguous(&self->buf, 'C') || self->buf.ndim > 1 || self->buf.len < 12)
+		return -1;
+	self->seed = getbe32(self->buf.buf + 0);
+	self->m = getbe32(self->buf.buf + 4);
+	self->n = getbe32(self->buf.buf + 8);
+	if (self->m == 0 || self->buf.len % 4 != 0 || (self->buf.len - 12) / 4 < self->m)
+		return -1;
+	return 0;
+}
+
+static void phash_dealloc(phashObject *self)
+{
+	if (self->with_buffer)
+		PyBuffer_Release(&self->buf);
+}
+
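+/*
+ * Candidate lookup in the on-disk perfect hash table.  The table starts
+ * with three big-endian 32-bit words (seed, m, n) followed by m 32-bit
+ * g-values; the candidate revision is (g[h0] + g[h1] + g[h2]) % n and is
+ * verified against the node stored in the index before being returned.
+ */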
PyModule_AddObject(mod, "phash", (PyObject *)&phashType); + caps = PyCapsule_New(&CAPI, "mercurial.cext.parsers.revlog_CAPI", NULL); if (caps != NULL) PyModule_AddObject(mod, "revlog_CAPI", caps); diff --git a/mercurial/changelog.py b/mercurial/changelog.py --- a/mercurial/changelog.py +++ b/mercurial/changelog.py @@ -12,6 +12,7 @@ bin, hex, nullid, + nullrev, ) from .thirdparty import attr diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py --- a/mercurial/localrepo.py +++ b/mercurial/localrepo.py @@ -2699,12 +2699,12 @@ self.filtered(b'served').branchmap() self.filtered(b'served.hidden').branchmap() + self.changelog.update_caches(transaction=tr) + self.manifestlog.update_caches(transaction=tr) + if full: unfi = self.unfiltered() - self.changelog.update_caches(transaction=tr) - self.manifestlog.update_caches(transaction=tr) - rbc = unfi.revbranchcache() for r in unfi.changelog: rbc.branchinfo(r) diff --git a/mercurial/pure/parsers.py b/mercurial/pure/parsers.py --- a/mercurial/pure/parsers.py +++ b/mercurial/pure/parsers.py @@ -284,3 +284,24 @@ write(e) write(f) return cs.getvalue() + + +from ..node import nullid, nullrev +from ..utils import phash_reader + +be32 = struct.Struct('>I') + + +class phash: + def __init__(self, data, revlogindex): + self.hashtable = phash_reader.hashtable(data) + self.revlogindex = revlogindex + + def get(self, node): + if node == nullid: + return nullrev + + rev = self.hashtable.get(node) + if self.revlogindex[rev][7] == node: + return rev + return None diff --git a/mercurial/revlog.py b/mercurial/revlog.py --- a/mercurial/revlog.py +++ b/mercurial/revlog.py @@ -227,6 +227,61 @@ indexformatv0_unpack = indexformatv0.unpack +class WrapperIndex(object): + def __init__(self, real_index, pphash, sphash): + self.__real_index = real_index + self.__pphash = pphash + self.__sphash = sphash + + def has_node(self, node): + if self.__pphash and self.__pphash.get(node) is not None: + return True + if self.__sphash and self.__sphash.get(node) is not None: + return True + return self.__real_index.has_node(node) + + def rev(self, node): + if self.__pphash: + rev = self.__pphash.get(node) + if rev is not None: + return rev + if self.__sphash: + rev = self.__sphash.get(node) + if rev is not None: + return rev + return self.__real_index.rev(node) + + def get_rev(self, node): + if self.__pphash: + rev = self.__pphash.get(node) + if rev is not None: + return rev + if self.__sphash: + rev = self.__sphash.get(node) + if rev is not None: + return rev + return self.__real_index.get_rev(node) + + def _stripnodes(self, start): + return self.__real_index.stripnodes(start) + + def clearcaches(self): + return self.__real_index.clearcaches() + + def __len__(self): + return self.__real_index.__len__() + return self._lgt + len(self._extra) + + def append(self, tup): + return self.__real_index.append(tup) + + def _check_index(self, i): + return self.__real_index._check_index(i) + + def __getitem__(self, i): + return self.__real_index.__getitem__(i) + + class revlogoldindex(list): @property def nodemap(self): @@ -385,6 +440,15 @@ return rustrevlog.MixedIndex(index), cache +def indexfile_sibling(opener, path, ext): + if not path.endswith(b".a"): + return path[:-2] + ext + pending_path = path[:-4] + ext + b".a" + if opener.exists(pending_path): + return pending_path + return path[:-4] + ext + + class revlog(object): """ the underlying revision storage object @@ -448,14 +512,12 @@ self.datafile = datafile or (indexfile[:-2] + b".d") self.nodemap_file = None if persistentnodemap: - if 
+class WrapperIndex(object):
+    def __init__(self, real_index, pphash, sphash):
+        self.__real_index = real_index
+        self.__pphash = pphash
+        self.__sphash = sphash
+
+    def has_node(self, node):
+        if self.__pphash and self.__pphash.get(node) is not None:
+            return True
+        if self.__sphash and self.__sphash.get(node) is not None:
+            return True
+        return self.__real_index.has_node(node)
+
+    def rev(self, node):
+        if self.__pphash:
+            rev = self.__pphash.get(node)
+            if rev is not None:
+                return rev
+        if self.__sphash:
+            rev = self.__sphash.get(node)
+            if rev is not None:
+                return rev
+        return self.__real_index.rev(node)
+
+    def get_rev(self, node):
+        if self.__pphash:
+            rev = self.__pphash.get(node)
+            if rev is not None:
+                return rev
+        if self.__sphash:
+            rev = self.__sphash.get(node)
+            if rev is not None:
+                return rev
+        return self.__real_index.get_rev(node)
+
+    def _stripnodes(self, start):
+        return self.__real_index.stripnodes(start)
+
+    def clearcaches(self):
+        return self.__real_index.clearcaches()
+
+    def __len__(self):
+        return self.__real_index.__len__()
+
+    def append(self, tup):
+        return self.__real_index.append(tup)
+
+    def _check_index(self, i):
+        return self.__real_index._check_index(i)
+
+    def __getitem__(self, i):
+        return self.__real_index.__getitem__(i)
+
+
 class revlogoldindex(list):
     @property
     def nodemap(self):
@@ -385,6 +440,15 @@
     return rustrevlog.MixedIndex(index), cache
 
 
+def indexfile_sibling(opener, path, ext):
+    if not path.endswith(b".a"):
+        return path[:-2] + ext
+    pending_path = path[:-4] + ext + b".a"
+    if opener.exists(pending_path):
+        return pending_path
+    return path[:-4] + ext
+
+
 class revlog(object):
     """
     the underlying revision storage object
@@ -448,14 +512,12 @@
         self.datafile = datafile or (indexfile[:-2] + b".d")
         self.nodemap_file = None
         if persistentnodemap:
-            if indexfile.endswith(b'.a'):
-                pending_path = indexfile[:-4] + b".n.a"
-                if opener.exists(pending_path):
-                    self.nodemap_file = pending_path
-                else:
-                    self.nodemap_file = indexfile[:-4] + b".n"
-            else:
-                self.nodemap_file = indexfile[:-2] + b".n"
+            self.nodemap_file = indexfile_sibling(opener, indexfile, b".n")
+
+        self._pphash_file = indexfile_sibling(opener, indexfile, b".pphf")
+        self._pphash = None
+        self._sphash_file = indexfile_sibling(opener, indexfile, b".sphf")
+        self._sphash = None
 
         self.opener = opener
         # When True, indexfile is opened with checkambig=True at writing, to
@@ -670,6 +732,18 @@
                         # no changelog tampering
                         self._nodemap_docket = docket
                         index.update_nodemap_data(*nodemap_data)
+            if not self._inline:
+                if self.opener.exists(self._pphash_file):
+                    f = self.opener.open(self._pphash_file)
+                    data = util.mmapread(f)
+                    self._pphash = parsers.phash(data, d[0])
+                if self.opener.exists(self._sphash_file):
+                    f = self.opener.open(self._sphash_file)
+                    data = util.mmapread(f)
+                    self._sphash = parsers.phash(data, d[0])
+                if self._pphash or self._sphash:
+                    d = WrapperIndex(d[0], self._pphash, self._sphash), d[1]
+
         except (ValueError, IndexError):
             raise error.RevlogError(
                 _(b"index %s is corrupted") % self.indexfile
@@ -780,12 +854,52 @@
             return False
         return True
 
+    phash_update_delta = 1000
+
+    def _update_pphash(self, transaction):
+        if self._pphash and transaction:
+            if (
+                transaction.changes[b'origrepolen'] // self.phash_update_delta
+                == len(self) // self.phash_update_delta
+            ):
+                return
+        elif len(self) < self.phash_update_delta:
+            return
+        from .utils import phash_writer
+
+        writer = phash_writer.hashtable()
+        for rev in pycompat.xrange(
+            len(self) // self.phash_update_delta * self.phash_update_delta
+        ):
+            writer.insert(self.node(rev))
+        data = writer.write()
+        f = self.opener(self._pphash_file, atomictemp=True, mode=b'wb')
+        f.write(data)
+        f.close()
+
+    def _update_sphash(self, transaction):
+        from .utils import phash_writer
+
+        writer = phash_writer.hashtable()
+        for rev in pycompat.xrange(
+            len(self) // self.phash_update_delta * self.phash_update_delta,
+            len(self),
+        ):
+            writer.insert(self.node(rev))
+        data = writer.write()
+        f = self.opener(self._sphash_file, atomictemp=True, mode=b'wb')
+        f.write(data)
+        f.close()
+
     def update_caches(self, transaction):
         if self.nodemap_file is not None:
             if transaction is None:
                 nodemaputil.update_persistent_nodemap(self)
             else:
                 nodemaputil.setup_persistent_nodemap(transaction, self)
+        if self._pphash_file and not self._inline:
+            self._update_pphash(transaction)
+            self._update_sphash(transaction)
 
     def clearcaches(self):
         self._revisioncache = None
diff --git a/mercurial/utils/phash_reader.py b/mercurial/utils/phash_reader.py
new file mode 100644
--- /dev/null
+++ b/mercurial/utils/phash_reader.py
@@ -0,0 +1,39 @@
+import struct
+
+be32 = struct.Struct('>I')
+
+
+def default_hash(key, seed):
+    base = seed * 4
+    return [
+        be32.unpack(key[base : base + 4])[0],
+        be32.unpack(key[base + 4 : base + 8])[0],
+        be32.unpack(key[base + 8 : base + 12])[0],
+    ]
+
+
+class hashtable:
+    def __init__(self, data, hash=default_hash):
+        self.data = data
+        (self.seed,) = be32.unpack(data[0:4])
+        (self.m,) = be32.unpack(data[4:8])
+        (self.n,) = be32.unpack(data[8:12])
+
+    def get(self, node):
+        base = self.seed * 4
+        h0 = be32.unpack(node[base : base + 4])[0] % self.m
+        h1 = be32.unpack(node[base + 4 : base + 8])[0] % self.m
+        h2 = be32.unpack(node[base + 8 : base + 12])[0] % self.m
+
+        if h0 == h1:
+            h1 ^= 1
+        if h0 == h2 or h1 == h2:
+            h2 ^= 1
+        if h0 == h2 or h1 == h2:
+            h2 ^= 2
+
+        def g(idx):
+            idx = 12 + 4 * idx
+            return be32.unpack(self.data[idx : idx + 4])[0]
+
+        return (g(h0 % self.m) + g(h1 % self.m) + g(h2 % self.m)) % self.n
diff --git a/mercurial/utils/phash_writer.py b/mercurial/utils/phash_writer.py
new file mode 100644
--- /dev/null
+++ b/mercurial/utils/phash_writer.py
@@ -0,0 +1,104 @@
+import struct
+
+be32 = struct.Struct('>I')
+
+
+def default_hash(key, seed):
+    base = seed * 4
+    return [
+        be32.unpack(key[base : base + 4])[0],
+        be32.unpack(key[base + 4 : base + 8])[0],
+        be32.unpack(key[base + 8 : base + 12])[0],
+    ]
+
+
+class hashtable:
+    def __init__(self, hash=default_hash):
+        self.entries = []
+        self.hash = hash
+
+    def insert(self, key):
+        self.entries.append(key)
+
+    def _hash_entry(self, nv, seed, entry):
+        hash = [x % nv for x in self.hash(entry, seed)]
+        if hash[0] == hash[1]:
+            hash[1] ^= 1
+        if hash[0] == hash[2] or hash[1] == hash[2]:
+            hash[2] ^= 1
+        if hash[0] == hash[2] or hash[1] == hash[2]:
+            hash[2] ^= 2
+
+        return hash
+
+    def _compute_graph(self, ne, nv, seed):
+        edges = []
+        vertices_edges = [0] * nv
+        vertices_degrees = [0] * nv
+
+        for i in range(ne):
+            e = self._hash_entry(nv, seed, self.entries[i])
+            edges.append(e)
+            for v in e:
+                vertices_edges[v] ^= i
+                vertices_degrees[v] += 1
+
+        output_order = []
+
+        def remove_vertex(v):
+            if vertices_degrees[v] != 1:
+                return
+            e = vertices_edges[v]
+            output_order.append(e)
+            for v in edges[e]:
+                vertices_edges[v] ^= e
+                vertices_degrees[v] -= 1
+
+        for v in range(nv):
+            remove_vertex(v)
+        oi = 0
+        while oi < len(output_order):
+            for v in edges[output_order[oi]]:
+                remove_vertex(v)
+            oi += 1
+
+        if len(output_order) == ne:
+            return edges, output_order
+        else:
+            return None
+
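+    # Find a seed for which the 3-uniform hypergraph of all inserted keys
+    # peels completely, then assign g-values in reverse peel order so that
+    # (g[h0] + g[h1] + g[h2]) % ne recovers each key's insertion index
+    # (a hypergraph-peeling minimal perfect hash construction).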
+    def write(self):
+        seed = 0
+        ne = len(self.entries)
+        nv = max(128, ne * 124 // 100)
+        nv = (nv | 3) + 1
+
+        rv = None
+        seed = -1
+        while rv is None:
+            seed += 1
+            rv = self._compute_graph(ne, nv, seed)
+        edges, output_order = rv
+
+        g = [0] * nv
+        visited = [False] * nv
+        for idx in reversed(output_order):
+            edge = edges[idx]
+            if not visited[edge[0]]:
+                g[edge[0]] = (idx - g[edge[1]] - g[edge[2]]) % ne
+            elif not visited[edge[1]]:
+                g[edge[1]] = (idx - g[edge[0]] - g[edge[2]]) % ne
+            else:
+                g[edge[2]] = (idx - g[edge[1]] - g[edge[0]]) % ne
+            for v in edge:
+                visited[v] = True
+
+        output = bytearray(12 + 4 * len(g))
+        be32.pack_into(output, 0, seed)
+        be32.pack_into(output, 4, nv)
+        be32.pack_into(output, 8, ne)
+        pos = 12
+        for v in g:
+            be32.pack_into(output, pos, v)
+            pos += 4
+        return output