diff --git a/remotefilelog/repack.py b/remotefilelog/repack.py --- a/remotefilelog/repack.py +++ b/remotefilelog/repack.py @@ -1,7 +1,6 @@ from __future__ import absolute_import import os -from collections import defaultdict from hgext3rd.extutil import runshellcommand from mercurial import ( error, @@ -11,7 +10,10 @@ scmutil, util, ) -from mercurial.node import nullid +from mercurial.node import ( + nullid, + short, +) from mercurial.i18n import _ from . import ( constants, @@ -426,6 +428,45 @@ for source in ledger.sources: source.cleanup(ledger) + def _chainorphans(self, ui, filename, nodes, orphans, deltabases): + """Reorderes ``orphans`` into a single chain inside ``nodes`` and + ``deltabases``. + + We often have orphan entries (nodes without a base that aren't + referenced by other nodes -- i.e., part of a chain) due to gaps in + history. Rather than store them as individual fulltexts, we prefer to + insert them as one chain sorted by size. + """ + if not orphans: + return nodes + + def getsize(node, default=0): + meta = self.data.getmeta(filename, node) + if constants.METAKEYSIZE in meta: + return meta[constants.METAKEYSIZE] + else: + return default + + # Sort orphans by size; biggest first is preferred, since it's more + # likely to be the newest version assuming files grow over time. + # (Sort by node first to ensure the sort is stable.) + orphans = sorted(orphans) + orphans = list(sorted(orphans, key=getsize, reverse=True)) + if ui.debugflag: + ui.debug("%s: orphan chain: %s\n" % (filename, + ", ".join([short(s) for s in orphans]))) + + # Create one contiguous chain and reassign deltabases. + for i, node in enumerate(orphans): + if i == 0: + deltabases[node] = (nullid, 0) + else: + parent = orphans[i - 1] + deltabases[node] = (parent, deltabases[parent][1] + 1) + nodes = filter(lambda node: node not in orphans, nodes) + nodes += orphans + return nodes + def repackdata(self, ledger, target): ui = self.repo.ui maxchainlen = ui.configint('packs', 'maxchainlen', 1000) @@ -465,29 +506,42 @@ len(nohistory)) orderednodes.extend(sorted(nohistory)) - # Compute deltas and write to the pack - deltabases = defaultdict(lambda: (nullid, 0)) + # Filter orderednodes to just the nodes we want to serialize (it + # currently also has the edge nodes' ancestors). + orderednodes = filter(lambda node: node in nodes, orderednodes) + + # Garbage collect old nodes: + if self.garbagecollect: + neworderednodes = [] + for node in orderednodes: + # If the node is old and is not in the keepset, we skip it, + # and mark as garbage collected + if ((filename, node) not in self.keepkeys and + self.isold(self.repo, filename, node)): + entries[node].gced = True + continue + neworderednodes.append(node) + orderednodes = neworderednodes + + # Compute delta bases for nodes: + deltabases = {} + nobase = set() + referenced = set() nodes = set(nodes) for i, node in enumerate(orderednodes): - # orderednodes is all ancestors, but we only want to serialize - # the files we have. - if node not in nodes: - continue - - if self.garbagecollect: - # If the node is old and is not in the keepset - # we skip it and mark as garbage collected - if ((filename, node) not in self.keepkeys and - self.isold(self.repo, filename, node)): - entries[node].gced = True - continue - ui.progress(_("processing nodes"), i, unit='nodes', total=len(orderednodes)) # Find delta base # TODO: allow delta'ing against most recent descendant instead # of immediate child - deltabase, chainlen = deltabases[node] + deltatuple = deltabases.get(node, None) + if deltatuple is None: + deltabase, chainlen = nullid, 0 + deltabases[node] = (nullid, 0) + nobase.add(node) + else: + deltabase, chainlen = deltatuple + referenced.add(deltabase) # Use available ancestor information to inform our delta choices ancestorinfo = ancestors.get(node) @@ -511,6 +565,15 @@ if p2 != nullid: deltabases[p2] = (node, chainlen + 1) + # experimental config: repack.chainorphansbysize + if ui.configbool('repack', 'chainorphansbysize', True): + orphans = nobase - referenced + orderednodes = self._chainorphans(ui, filename, orderednodes, + orphans, deltabases) + + # Compute deltas and write to the pack + for i, node in enumerate(orderednodes): + deltabase, chainlen = deltabases[node] # Compute delta # TODO: Optimize the deltachain fetching. Since we're # iterating over the different version of the file, we may diff --git a/tests/test-remotefilelog-lfs.t b/tests/test-remotefilelog-lfs.t --- a/tests/test-remotefilelog-lfs.t +++ b/tests/test-remotefilelog-lfs.t @@ -233,10 +233,10 @@ $TESTTMP/hgcache $TESTTMP/hgcache/master $TESTTMP/hgcache/master/packs - $TESTTMP/hgcache/master/packs/879f0543e467d3cffb512cc0392ebece41b1480f.dataidx - $TESTTMP/hgcache/master/packs/879f0543e467d3cffb512cc0392ebece41b1480f.datapack $TESTTMP/hgcache/master/packs/bf634767241b49b174b18732f92c6653ff966751.histidx $TESTTMP/hgcache/master/packs/bf634767241b49b174b18732f92c6653ff966751.histpack + $TESTTMP/hgcache/master/packs/faa267575712c2ee0a4ff7e9c09bf75e10055c04.dataidx + $TESTTMP/hgcache/master/packs/faa267575712c2ee0a4ff7e9c09bf75e10055c04.datapack $TESTTMP/hgcache/repos $ hg log -p -r ::tip -T '{rev}:{node} {desc}\n' diff --git a/tests/test-remotefilelog-repack-fast.t b/tests/test-remotefilelog-repack-fast.t --- a/tests/test-remotefilelog-repack-fast.t +++ b/tests/test-remotefilelog-repack-fast.t @@ -220,8 +220,8 @@ 2 files fetched over 2 fetches - (2 misses, 0.00% hit ratio) over * (glob) $ hg repack $ ls $TESTTMP/hgcache/master/packs - e634f60d2a9539fc595b1f4db480c64556a396c7.dataidx - e634f60d2a9539fc595b1f4db480c64556a396c7.datapack + e8fdf7ae22b772dcc291f905b9c6e5f381d28739.dataidx + e8fdf7ae22b772dcc291f905b9c6e5f381d28739.datapack ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histidx ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histpack $ hg debughistorypack $TESTTMP/hgcache/master/packs/*.histidx @@ -252,14 +252,14 @@ $ rm -rf $CACHEDIR/master/packs/*hist* $ hg repack $ hg debugdatapack $TESTTMP/hgcache/master/packs/*.datapack - $TESTTMP/hgcache/master/packs/055c02949317b8507cdb7aaf2e00cc00fd0c5716: + $TESTTMP/hgcache/master/packs/a8d86ff8e1a11a77a85f5fea567f56a757583eda: x: Node Delta Base Delta Length Blob Size 1bb2e6237e03 000000000000 8 8 - aee31534993a 000000000000 4 4 - d4a3ed9310e5 000000000000 6 6 + d4a3ed9310e5 1bb2e6237e03 12 6 + aee31534993a d4a3ed9310e5 12 4 - Total: 18 18 (0.0% bigger) + Total: 32 18 (77.8% bigger) y: Node Delta Base Delta Length Blob Size 577959738234 000000000000 70 8 diff --git a/tests/test-remotefilelog-repack.t b/tests/test-remotefilelog-repack.t --- a/tests/test-remotefilelog-repack.t +++ b/tests/test-remotefilelog-repack.t @@ -227,8 +227,8 @@ 2 files fetched over 2 fetches - (2 misses, 0.00% hit ratio) over * (glob) $ hg repack $ ls $TESTTMP/hgcache/master/packs - e634f60d2a9539fc595b1f4db480c64556a396c7.dataidx - e634f60d2a9539fc595b1f4db480c64556a396c7.datapack + e8fdf7ae22b772dcc291f905b9c6e5f381d28739.dataidx + e8fdf7ae22b772dcc291f905b9c6e5f381d28739.datapack ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histidx ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histpack $ hg debughistorypack $TESTTMP/hgcache/master/packs/*.histidx @@ -259,14 +259,14 @@ $ rm -rf $CACHEDIR/master/packs/*hist* $ hg repack $ hg debugdatapack $TESTTMP/hgcache/master/packs/*.datapack - $TESTTMP/hgcache/master/packs/055c02949317b8507cdb7aaf2e00cc00fd0c5716: + $TESTTMP/hgcache/master/packs/a8d86ff8e1a11a77a85f5fea567f56a757583eda: x: Node Delta Base Delta Length Blob Size 1bb2e6237e03 000000000000 8 8 - aee31534993a 000000000000 4 4 - d4a3ed9310e5 000000000000 6 6 + d4a3ed9310e5 1bb2e6237e03 12 6 + aee31534993a d4a3ed9310e5 12 4 - Total: 18 18 (0.0% bigger) + Total: 32 18 (77.8% bigger) y: Node Delta Base Delta Length Blob Size 577959738234 000000000000 70 8