diff --git a/remotefilelog/repack.py b/remotefilelog/repack.py
--- a/remotefilelog/repack.py
+++ b/remotefilelog/repack.py
@@ -11,7 +11,10 @@
     scmutil,
     util,
 )
-from mercurial.node import nullid
+from mercurial.node import (
+    nullid,
+    short
+)
 from mercurial.i18n import _
 from . import (
     constants,
@@ -422,23 +425,26 @@
         orderednodes = list(reversed(self._toposort(ancestors)))
         orderednodes.extend(sorted(nohistory))
 
-        # Compute deltas and write to the pack
+        # Filter orderednodes to just the nodes we want to serialize (it
+        # currently also has the edge nodes' ancestors).
+        orderednodes = list(filter(lambda node: node in nodes,
+                                   orderednodes))
+
+        if self.garbagecollect:
+            def shouldkeep(node):
+                # If the node is old and is not in the keepset, we skip it,
+                # and mark as garbage collected
+                if ((filename, node) not in self.keepkeys and
+                    self.isold(self.repo, filename, node)):
+                    entries[node].gced = True
+                    return False
+                return True
+            orderednodes = list(filter(shouldkeep, orderednodes))
+
+        # Compute delta bases for nodes
         deltabases = defaultdict(lambda: (nullid, 0))
         nodes = set(nodes)
         for i, node in enumerate(orderednodes):
-            # orderednodes is all ancestors, but we only want to serialize
-            # the files we have.
-            if node not in nodes:
-                continue
-
-            if self.garbagecollect:
-                # If the node is old and is not in the keepset
-                # we skip it and mark as garbage collected
-                if ((filename, node) not in self.keepkeys and
-                    self.isold(self.repo, filename, node)):
-                    entries[node].gced = True
-                    continue
-
             ui.progress(_("processing nodes"), i, unit='nodes',
                         total=len(orderednodes))
             # Find delta base
@@ -468,6 +474,62 @@
             if p2 != nullid:
                 deltabases[p2] = (node, chainlen + 1)
 
+        if ui.configbool('repack', 'chainorphansbysize', True):
+            # Find orphans (nodes not referenced by other nodes) and move
+            # them into a single chain sorted by size. This yields better
+            # storage than storing them all as fulltexts, which would be
+            # the default behavior.
+            orphans = []
+            neworderednodes = []
+            for i, node in enumerate(orderednodes):
+                if deltabases[node] == (nullid, 0):
+                    # Since nodes are stored in chain order, look ahead to
+                    # see if the next entry is in the same chain. If not,
+                    # we can conclude it's an orphan.
+                    if (i < len(orderednodes) - 1 and
+                        deltabases[orderednodes[i + 1]] != (node, 1)):
+                        orphans.append(node)
+                        del deltabases[node]
+                    else:
+                        neworderednodes.append(node)
+                        ui.debug("%s: %s starts a chain\n" %
+                                 (filename, short(node)))
+                else:
+                    neworderednodes.append(node)
+            if orphans:
+                def getsize(node, default=0):
+                    meta = self.data.getmeta(filename, node)
+                    if constants.METAKEYSIZE in meta:
+                        return meta[constants.METAKEYSIZE]
+                    else:
+                        return default
+
+                # Sort orphans by size.
+                # TODO make this stable for identical sizes?
+                orphans = list(reversed(sorted(orphans, key=getsize)))
+                ui.debug("%s: orphan chain: %s\n" % (filename,
+                         ", ".join([short(s) for s in orphans])))
+                ui.debug("%s: orderednodes: %s\n" % (filename,
+                         ", ".join([short(s) for s in neworderednodes])))
+
+                # Reintroduce as one chain.
+                for i, node in enumerate(orphans):
+                    if i == 0:
+                        deltabases[node] = (nullid, 0)
+                    else:
+                        # BUG FIX: was `orphans[i - i]`, which is always
+                        # index 0 and would base every orphan on the first
+                        # one (a star, not a chain). The delta parent of
+                        # orphan i is the previous orphan, i.e. `i - 1`.
+                        parent = orphans[i - 1]
+                        deltabases[node] = (parent,
+                                            deltabases[parent][1] + 1)
+                neworderednodes += orphans
+            orderednodes = neworderednodes
+
+        # Compute deltas and write to the pack
+        for i, node in enumerate(orderednodes):
+            deltabase, chainlen = deltabases[node]
             # Compute delta
             # TODO: Optimize the deltachain fetching. Since we're
             # iterating over the different version of the file, we may
diff --git a/tests/test-remotefilelog-repack-fast.t b/tests/test-remotefilelog-repack-fast.t
--- a/tests/test-remotefilelog-repack-fast.t
+++ b/tests/test-remotefilelog-repack-fast.t
@@ -217,8 +217,8 @@
   2 files fetched over 2 fetches - (2 misses, 0.00% hit ratio) over * (glob)
   $ hg repack
   $ ls $TESTTMP/hgcache/master/packs
-  e634f60d2a9539fc595b1f4db480c64556a396c7.dataidx
-  e634f60d2a9539fc595b1f4db480c64556a396c7.datapack
+  655c693cc097b59d135a1134a939cfbe57e798aa.dataidx
+  655c693cc097b59d135a1134a939cfbe57e798aa.datapack
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histidx
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histpack
   $ hg debughistorypack $TESTTMP/hgcache/master/packs/*.histidx
@@ -252,10 +252,10 @@
   x:
   Node          Delta Base    Delta Length  Blob Size
+  d4a3ed9310e5  000000000000  6             6
   1bb2e6237e03  000000000000  8             8
-  aee31534993a  000000000000  4             4
-  d4a3ed9310e5  000000000000  6             6
-  
-  Total:                      18            18 (0.0% bigger)
+  aee31534993a  1bb2e6237e03  12            4
+  
+  Total:                      26            18 (44.4% bigger)
   y:
   Node          Delta Base    Delta Length  Blob Size
diff --git a/tests/test-remotefilelog-repack.t b/tests/test-remotefilelog-repack.t
--- a/tests/test-remotefilelog-repack.t
+++ b/tests/test-remotefilelog-repack.t
@@ -212,8 +212,8 @@
   2 files fetched over 2 fetches - (2 misses, 0.00% hit ratio) over * (glob)
   $ hg repack
   $ ls $TESTTMP/hgcache/master/packs
-  e634f60d2a9539fc595b1f4db480c64556a396c7.dataidx
-  e634f60d2a9539fc595b1f4db480c64556a396c7.datapack
+  655c693cc097b59d135a1134a939cfbe57e798aa.dataidx
+  655c693cc097b59d135a1134a939cfbe57e798aa.datapack
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histidx
   ebbd7411e00456c0eec8d1150a77e2b3ef490f3f.histpack
   $ hg debughistorypack $TESTTMP/hgcache/master/packs/*.histidx
@@ -247,10 +247,10 @@
   x:
   Node          Delta Base    Delta Length  Blob Size
+  d4a3ed9310e5  000000000000  6             6
   1bb2e6237e03  000000000000  8             8
-  aee31534993a  000000000000  4             4
-  d4a3ed9310e5  000000000000  6             6
-  
-  Total:                      18            18 (0.0% bigger)
+  aee31534993a  1bb2e6237e03  12            4
+  
+  Total:                      26            18 (44.4% bigger)
   y:
   Node          Delta Base    Delta Length  Blob Size