diff --git a/remotefilelog/debugcommands.py b/remotefilelog/debugcommands.py
--- a/remotefilelog/debugcommands.py
+++ b/remotefilelog/debugcommands.py
@@ -233,9 +233,11 @@
         ))
 
     lastfilename = None
-    seennodes = {nullid: True}
-    failures = 0
+    bases = {}
+    nodes = []
     for filename, node, deltabase, deltalen in dpack.iterentries():
+        bases[node] = deltabase
+        nodes.append(node)
         if filename != lastfilename:
             printtotals()
             name = '(empty name)' if filename == '' else filename
@@ -262,20 +264,45 @@
                 hashformatter(deltabase),
                 str(deltalen).ljust(14),
                 blobsize))
-
-        # The only integrity constraint is that all deltabases must exist
-        # earlier in the pack.
-        if not deltabase in seennodes:
-            ui.warn("^ BAD ENTRY - unknown delta base of %s\n" %
-                    short(deltabase))
-            failures += 1
-        seennodes[node] = True
 
     if filename is not None:
         printtotals()
+
+    failures = _sanitycheck(ui, set(nodes), bases)
     if failures > 1:
-        ui.warn("\n%d invalid entries\n" % failures)
+        ui.warn("%d invalid entries\n" % failures)
         return 1
 
+def _sanitycheck(ui, nodes, bases):
+    """
+    Does some basic sanity checking on a packfile with ``nodes`` and
+    ``bases`` (a mapping of node->base):
+
+    - Each deltabase must itself be a node elsewhere in the pack
+    - There must be no cycles
+    """
+    failures = 0
+    for node in nodes:
+        seen = set()
+        current = node
+        deltabase = bases[current]
+
+        while deltabase != nullid:
+            if deltabase not in nodes:
+                ui.warn("Bad entry: %s has an unknown deltabase (%s)\n" %
+                        (short(node), short(deltabase)))
+                failures += 1
+                break
+
+            if deltabase in seen:
+                ui.warn("Bad entry: %s has a cycle\n" % short(node))
+                failures += 1
+                break
+
+            seen.add(deltabase)
+            current = deltabase
+            deltabase = bases[current]
+    return failures
+
 def dumpdeltachain(ui, deltachain, **opts):
     hashformatter = hex
     hashlen = 40
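
For review, here is a self-contained sketch of the traversal `_sanitycheck` performs, using toy string nodes in place of the real 20-byte binary hashes (`NULLID`, `sanitycheck`, and the node names below are illustrative stand-ins, not part of the patch):

```python
# Toy model of the _sanitycheck traversal: walk each node's delta chain
# until we reach the null base, flagging unknown bases and cycles.
NULLID = "null"

def sanitycheck(nodes, bases):
    failures = 0
    for node in nodes:
        seen = set()
        current = node
        deltabase = bases[current]
        while deltabase != NULLID:
            if deltabase not in nodes:
                # The delta base is missing from the pack entirely.
                print("Bad entry: %s has an unknown deltabase (%s)" %
                      (node, deltabase))
                failures += 1
                break
            if deltabase in seen:
                # We returned to a node already visited on this walk.
                print("Bad entry: %s has a cycle" % node)
                failures += 1
                break
            seen.add(deltabase)
            current = deltabase
            deltabase = bases[current]
    return failures

# c -> b -> a -> null is a valid chain; x -> y -> x never terminates.
assert sanitycheck({"a", "b", "c"}, {"a": NULLID, "b": "a", "c": "b"}) == 0
assert sanitycheck({"x", "y"}, {"x": "y", "y": "x"}) == 2
```

One observation: each chain is re-walked once per member node, so the check is quadratic in chain length in the worst case, which seems acceptable for a debug command.
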
diff --git a/remotefilelog/repack.py b/remotefilelog/repack.py
--- a/remotefilelog/repack.py
+++ b/remotefilelog/repack.py
@@ -11,7 +11,10 @@
     scmutil,
     util,
 )
-from mercurial.node import nullid
+from mercurial.node import (
+    nullid,
+    short
+)
 from mercurial.i18n import _
 from . import (
     constants,
@@ -422,23 +425,26 @@
         orderednodes = list(reversed(self._toposort(ancestors)))
         orderednodes.extend(sorted(nohistory))
 
-        # Compute deltas and write to the pack
+        # Filter orderednodes to just the nodes we want to serialize (it
+        # currently also has the edge nodes' ancestors).
+        orderednodes = list(filter(lambda node: node in nodes,
+                                   orderednodes))
+
+        if self.garbagecollect:
+            def shouldkeep(node):
+                # If the node is old and is not in the keepset, we skip it,
+                # and mark as garbage collected
+                if ((filename, node) not in self.keepkeys and
+                        self.isold(self.repo, filename, node)):
+                    entries[node].gced = True
+                    return False
+                return True
+            orderednodes = list(filter(shouldkeep, orderednodes))
+
+        # Compute delta bases for nodes
         deltabases = defaultdict(lambda: (nullid, 0))
         nodes = set(nodes)
         for i, node in enumerate(orderednodes):
-            # orderednodes is all ancestors, but we only want to serialize
-            # the files we have.
-            if node not in nodes:
-                continue
-
-            if self.garbagecollect:
-                # If the node is old and is not in the keepset
-                # we skip it and mark as garbage collected
-                if ((filename, node) not in self.keepkeys and
-                        self.isold(self.repo, filename, node)):
-                    entries[node].gced = True
-                    continue
-
             ui.progress(_("processing nodes"), i, unit='nodes',
                         total=len(orderednodes))
             # Find delta base
@@ -468,6 +474,58 @@
             if p2 != nullid:
                 deltabases[p2] = (node, chainlen + 1)
 
+        if ui.configbool('repack', 'chainorphansbysize', True):
+            # Find orphans (nodes not referenced by other nodes) and move
+            # them into a single chain sorted by size. This yields better
+            # storage than storing them all as fulltexts, which would be
+            # the default behavior.
+            orphans = []
+            neworderednodes = []
+            for i, node in enumerate(orderednodes):
+                if deltabases[node] == (nullid, 0):
+                    # Since nodes are stored in chain order, look ahead to
+                    # see if the next entry is in the same chain. If not,
+                    # we can conclude it's an orphan.
+                    if (i < len(orderednodes) - 1 and
+                            deltabases[orderednodes[i + 1]] != (node, 1)):
+                        orphans.append(node)
+                        del deltabases[node]
+                    else:
+                        neworderednodes.append(node)
+                        ui.debug("%s: %s starts a chain\n" %
+                                 (filename, short(node)))
+                else:
+                    neworderednodes.append(node)
+
+            if orphans:
+                def getsize(node, default=0):
+                    meta = self.data.getmeta(filename, node)
+                    if constants.METAKEYSIZE in meta:
+                        return meta[constants.METAKEYSIZE]
+                    else:
+                        return default
+
+                # Sort orphans by size.
+                # TODO make this stable for identical sizes?
+                orphans = list(reversed(sorted(orphans, key=getsize)))
+                ui.debug("%s: orphan chain: %s\n" % (filename,
+                         ", ".join([short(s) for s in orphans])))
+                ui.debug("%s: orderednodes: %s\n" % (filename,
+                         ", ".join([short(s) for s in neworderednodes])))
+
+                # Reintroduce as one chain.
+                for i, node in enumerate(orphans):
+                    if i == 0:
+                        deltabases[node] = (nullid, 0)
+                    else:
+                        parent = orphans[i - 1]
+                        deltabases[node] = (parent,
+                                            deltabases[parent][1] + 1)
+
+            neworderednodes += orphans
+            orderednodes = neworderednodes
+
+        # Compute deltas and write to the pack
+        for i, node in enumerate(orderednodes):
+            deltabase, chainlen = deltabases[node]
+
             # Compute delta
             # TODO: Optimize the deltachain fetching. Since we're
             # iterating over the different version of the file, we may
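
The look-ahead in the loop above is the subtle part: `orderednodes` lists each delta chain contiguously, so a node with base `(nullid, 0)` whose successor does not delta directly against it must be a chain of length one, i.e. an orphan. A self-contained sketch under the same assumptions as before (toy string nodes, a plain dict standing in for the pack metadata that `getsize` reads; none of these names appear in the patch, and where the real code deletes entries from a defaultdict, the sketch simply overwrites):

```python
# Sketch of the orphan-chaining pass: collect chain-of-one nodes, then
# re-link them into a single chain sorted by size, largest first.
NULLID = "null"

def chainorphans(orderednodes, deltabases, sizes):
    orphans, kept = [], []
    for i, node in enumerate(orderednodes):
        if deltabases[node] == (NULLID, 0):
            # A chain head whose successor does not delta against it
            # is an orphan (a chain of length one).
            islast = i == len(orderednodes) - 1
            if not islast and deltabases[orderednodes[i + 1]] != (node, 1):
                orphans.append(node)
                continue
        kept.append(node)
    # Largest first, so each delta is taken against a text at least as
    # large as itself (missing sizes default to 0, as in getsize).
    orphans.sort(key=lambda n: sizes.get(n, 0), reverse=True)
    for i, node in enumerate(orphans):
        if i == 0:
            deltabases[node] = (NULLID, 0)
        else:
            parent = orphans[i - 1]
            deltabases[node] = (parent, deltabases[parent][1] + 1)
    return kept + orphans

# "a" heads a real chain (b deltas against it); x and y are orphans;
# "c" is last, so the look-ahead cannot run and it is kept as-is.
order = ["a", "b", "x", "y", "c"]
bases = {"a": (NULLID, 0), "b": ("a", 1),
         "x": (NULLID, 0), "y": (NULLID, 0), "c": (NULLID, 0)}
print(chainorphans(order, bases, {"x": 10, "y": 40}))
# ['a', 'b', 'c', 'y', 'x'], with y -> null and x -> y in bases
```

Note the edge case the `i < len(orderednodes) - 1` guard creates: a chain head in the final position has no successor to inspect, so it is conservatively kept as a fulltext rather than folded into the orphan chain.
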
diff --git a/tests/test-remotefilelog-repack-fast.t b/tests/test-remotefilelog-repack-fast.t
--- a/tests/test-remotefilelog-repack-fast.t
+++ b/tests/test-remotefilelog-repack-fast.t
@@ -217,8 +217,8 @@
   2 files fetched over 2 fetches - (2 misses, 0.00% hit ratio) over * (glob)
   $ hg repack
   $ ls $TESTTMP/hgcache/master/packs
-  e634f60d2a9539fc595b1f4db480c64556a396c7.dataidx
-  e634f60d2a9539fc595b1f4db480c64556a396c7.datapack
+  655c693cc097b59d135a1134a939cfbe57e798aa.dataidx
+  655c693cc097b59d135a1134a939cfbe57e798aa.datapack
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histidx
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histpack
   $ hg debughistorypack $TESTTMP/hgcache/master/packs/*.histidx
@@ -252,10 +252,10 @@
   x:
   Node          Delta Base    Delta Length  Blob Size
+  d4a3ed9310e5  000000000000  6             6
   1bb2e6237e03  000000000000  8             8
-  aee31534993a  000000000000  4             4
-  d4a3ed9310e5  000000000000  6             6
-
-  Total:                      18            18        (0.0% bigger)
+  aee31534993a  1bb2e6237e03  12            4
+
+  Total:                      26            18        (44.4% bigger)
 
   y:
   Node          Delta Base    Delta Length  Blob Size
diff --git a/tests/test-remotefilelog-repack.t b/tests/test-remotefilelog-repack.t
--- a/tests/test-remotefilelog-repack.t
+++ b/tests/test-remotefilelog-repack.t
@@ -212,8 +212,8 @@
   2 files fetched over 2 fetches - (2 misses, 0.00% hit ratio) over * (glob)
   $ hg repack
   $ ls $TESTTMP/hgcache/master/packs
-  e634f60d2a9539fc595b1f4db480c64556a396c7.dataidx
-  e634f60d2a9539fc595b1f4db480c64556a396c7.datapack
+  655c693cc097b59d135a1134a939cfbe57e798aa.dataidx
+  655c693cc097b59d135a1134a939cfbe57e798aa.datapack
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histidx
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histpack
   $ hg debughistorypack $TESTTMP/hgcache/master/packs/*.histidx
@@ -247,10 +247,10 @@
   x:
   Node          Delta Base    Delta Length  Blob Size
+  d4a3ed9310e5  000000000000  6             6
   1bb2e6237e03  000000000000  8             8
-  aee31534993a  000000000000  4             4
-  d4a3ed9310e5  000000000000  6             6
-
-  Total:                      18            18        (0.0% bigger)
+  aee31534993a  1bb2e6237e03  12            4
+
+  Total:                      26            18        (44.4% bigger)
 
   y:
   Node          Delta Base    Delta Length  Blob Size
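
The revised totals in both tests follow from the new chain shape: `aee31534993a` is now stored as a 12-byte delta against `1bb2e6237e03` rather than a 4-byte fulltext, which on blobs this small happens to make the pack bigger; the saving only materializes on realistically sized files. A quick check of the printed percentage, recomputed from the visible columns rather than from the implementation:

```python
# Recomputing the "(44.4% bigger)" figure from the x: table above.
deltalens = [6, 8, 12]  # Delta Length column
blobsizes = [6, 8, 4]   # Blob Size column
totaldelta, totalblob = sum(deltalens), sum(blobsizes)   # 26, 18
pct = (totaldelta - totalblob) / float(totalblob) * 100  # 44.44...
print("%d  %d  (%0.1f%% bigger)" % (totaldelta, totalblob, pct))
```
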