diff --git a/hgext/git/__init__.py b/hgext/git/__init__.py new file mode 100644 --- /dev/null +++ b/hgext/git/__init__.py @@ -0,0 +1,251 @@ +"""grant Mercurial the ability to operate on Git repositories. (EXPERIMENTAL) + +This is currently super experimental. It probably will consume your +firstborn a la Rumpelstiltskin, etc. +""" + +from __future__ import absolute_import + +import os + +import pygit2 + +from mercurial.i18n import _ + +from mercurial import ( + commands, + error, + extensions, + localrepo, + pycompat, + store, +) + +from . import ( + dirstate, + gitlog, + gitutil, + index, +) + + +# TODO: extract an interface for this in core +class gitstore(object): # store.basicstore): + def __init__(self, path, vfstype): + self.vfs = vfstype(path) + self.path = self.vfs.base + self.createmode = store._calcmode(self.vfs) + # above lines should go away in favor of: + # super(gitstore, self).__init__(path, vfstype) + + self.git = pygit2.Repository( + os.path.normpath(os.path.join(path, b'..', b'.git')) + ) + self._progress_factory = lambda *args, **kwargs: None + self._db_handle = None + + @property + def _db(self): + # We lazy-create the database because we want to thread a + # progress callback down to the indexing process if it's + # required, and we don't have a ui handle in makestore(). + if self._db_handle is None: + self._db_handle = index.get_index(self.git, self._progress_factory) + return self._db_handle + + def join(self, f): + """Fake store.join method for git repositories. + + For the most part, store.join is used for @storecache + decorators to invalidate caches when various files + change. We'll map the ones we care about, and ignore the rest. + """ + if f in (b'00changelog.i', b'00manifest.i'): + # This is close enough: in order for the changelog cache + # to be invalidated, HEAD will have to change. + return os.path.join(self.path, b'HEAD') + elif f == b'lock': + # TODO: we probably want to map this to a git lock, I + # suspect index.lock. 
We should figure out what the + # most-alike file is in git-land. For now we're risking + # bad concurrency errors if another git client is used. + return os.path.join(self.path, b'hgit-bogus-lock') + elif f in (b'obsstore', b'phaseroots', b'narrowspec', b'bookmarks'): + return os.path.join(self.path, b'..', b'.hg', f) + raise NotImplementedError(b'Need to pick file for %s.' % f) + + def changelog(self, trypending): + # TODO we don't have a plan for trypending in hg's git support yet + return gitlog.changelog(self.git, self._db) + + def manifestlog(self, repo, storenarrowmatch): + # TODO handle storenarrowmatch and figure out if we need the repo arg + return gitlog.manifestlog(self.git, self._db) + + def invalidatecaches(self): + pass + + def write(self, tr=None): + # normally this handles things like fncache writes, which we don't have + pass + + +def _makestore(orig, requirements, storebasepath, vfstype): + if os.path.exists( + os.path.join(storebasepath, b'this-is-git') + ) and os.path.exists(os.path.join(storebasepath, b'..', b'.git')): + return gitstore(storebasepath, vfstype) + return orig(requirements, storebasepath, vfstype) + + +class gitfilestorage(object): + def file(self, path): + if path[0:1] == b'/': + path = path[1:] + return gitlog.filelog(self.store.git, self.store._db, path) + + +def _makefilestorage(orig, requirements, features, **kwargs): + store = kwargs['store'] + if isinstance(store, gitstore): + return gitfilestorage + return orig(requirements, features, **kwargs) + + +def _setupdothg(ui, path): + dothg = os.path.join(path, b'.hg') + if os.path.exists(dothg): + ui.warn(_(b'git repo already initialized for hg\n')) + else: + os.mkdir(os.path.join(path, b'.hg')) + # TODO is it ok to extend .git/info/exclude like this? 
+ with open( + os.path.join(path, b'.git', b'info', b'exclude'), 'ab' + ) as exclude: + exclude.write(b'\n.hg\n') + with open(os.path.join(dothg, b'this-is-git'), 'wb') as f: + pass + with open(os.path.join(dothg, b'requirements'), 'wb') as f: + f.write(b'git\n') + + +_BMS_PREFIX = 'refs/heads/' + + +class gitbmstore(object): + def __init__(self, gitrepo): + self.gitrepo = gitrepo + + def __contains__(self, name): + return ( + _BMS_PREFIX + pycompat.fsdecode(name) + ) in self.gitrepo.references + + def __iter__(self): + for r in self.gitrepo.listall_references(): + if r.startswith(_BMS_PREFIX): + yield pycompat.fsencode(r[len(_BMS_PREFIX) :]) + + def __getitem__(self, k): + return ( + self.gitrepo.references[_BMS_PREFIX + pycompat.fsdecode(k)] + .peel() + .id.raw + ) + + def get(self, k, default=None): + try: + if k in self: + return self[k] + return default + except pygit2.InvalidSpecError: + return default + + @property + def active(self): + h = self.gitrepo.references['HEAD'] + if not isinstance(h.target, str) or not h.target.startswith( + _BMS_PREFIX + ): + return None + return pycompat.fsencode(h.target[len(_BMS_PREFIX) :]) + + @active.setter + def active(self, mark): + raise NotImplementedError + + def names(self, node): + r = [] + for ref in self.gitrepo.listall_references(): + if not ref.startswith(_BMS_PREFIX): + continue + if self.gitrepo.references[ref].peel().id.raw != node: + continue + r.append(pycompat.fsencode(ref[len(_BMS_PREFIX) :])) + return r + + # Cleanup opportunity: this is *identical* to core's bookmarks store. 
+ def expandname(self, bname): + if bname == b'.': + if self.active: + return self.active + raise error.RepoLookupError(_(b"no active bookmark")) + return bname + + def applychanges(self, repo, tr, changes): + """Apply a list of changes to bookmarks + """ + # TODO: this should respect transactions, but that's going to + # require enlarging the gitbmstore to know how to do in-memory + # temporary writes and read those back prior to transaction + # finalization. + for name, node in changes: + if node is None: + self.gitrepo.references.delete( + _BMS_PREFIX + pycompat.fsdecode(name) + ) + else: + self.gitrepo.references.create( + _BMS_PREFIX + pycompat.fsdecode(name), + gitutil.togitnode(node), + force=True, + ) + + +def init(orig, ui, dest=b'.', **opts): + if opts.get('git', False): + path = os.path.abspath(dest) + # TODO: walk up looking for the git repo + _setupdothg(ui, path) + return 0 + return orig(ui, dest=dest, **opts) + + +def reposetup(ui, repo): + if isinstance(repo.store, gitstore): + orig = repo.__class__ + repo.store._progress_factory = repo.ui.makeprogress + + class gitlocalrepo(orig): + def _makedirstate(self): + # TODO narrow support here + return dirstate.gitdirstate( + self.ui, self.vfs.base, self.store.git + ) + + @property + def _bookmarks(self): + return gitbmstore(self.store.git) + + repo.__class__ = gitlocalrepo + return repo + + +def extsetup(ui): + extensions.wrapfunction(localrepo, b'makestore', _makestore) + extensions.wrapfunction(localrepo, b'makefilestorage', _makefilestorage) + # Inject --git flag for `hg init` + entry = extensions.wrapcommand(commands.table, b'init', init) + entry[1].extend( + [(b'', b'git', None, b'setup up a git repository instead of hg')] + ) diff --git a/hgext/git/dirstate.py b/hgext/git/dirstate.py new file mode 100644 --- /dev/null +++ b/hgext/git/dirstate.py @@ -0,0 +1,278 @@ +from __future__ import absolute_import + +import contextlib +import errno +import os + +import pygit2 + +from mercurial import ( + error, 
+ extensions, + match as matchmod, + node as nodemod, + pycompat, + scmutil, + util, +) +from mercurial.interfaces import ( + dirstate as intdirstate, + util as interfaceutil, +) + +from . import gitutil + + +def readpatternfile(orig, filepath, warn, sourceinfo=False): + if not (b'info/exclude' in filepath or filepath.endswith(b'.gitignore')): + return orig(filepath, warn, sourceinfo=False) + result = [] + warnings = [] + with open(filepath, b'rb') as fp: + for l in fp: + l = l.strip() + if not l or l.startswith(b'#'): + continue + if l.startswith(b'!'): + # on reflection, I think /foo is just glob: + warnings.append(b'unsupported ignore pattern %s' % l) + continue + if l.startswith(b'/'): + result.append(b'glob:' + l[1:]) + else: + result.append(b'relglob:' + l) + return result, warnings + + +extensions.wrapfunction(matchmod, b'readpatternfile', readpatternfile) + + +_STATUS_MAP = { + pygit2.GIT_STATUS_CONFLICTED: b'm', + pygit2.GIT_STATUS_CURRENT: b'n', + pygit2.GIT_STATUS_IGNORED: b'?', + pygit2.GIT_STATUS_INDEX_DELETED: b'r', + pygit2.GIT_STATUS_INDEX_MODIFIED: b'n', + pygit2.GIT_STATUS_INDEX_NEW: b'a', + pygit2.GIT_STATUS_INDEX_RENAMED: b'a', + pygit2.GIT_STATUS_INDEX_TYPECHANGE: b'n', + pygit2.GIT_STATUS_WT_DELETED: b'r', + pygit2.GIT_STATUS_WT_MODIFIED: b'n', + pygit2.GIT_STATUS_WT_NEW: b'?', + pygit2.GIT_STATUS_WT_RENAMED: b'a', + pygit2.GIT_STATUS_WT_TYPECHANGE: b'n', + pygit2.GIT_STATUS_WT_UNREADABLE: b'?', +} + + +@interfaceutil.implementer(intdirstate.idirstate) +class gitdirstate(object): + def __init__(self, ui, root, gitrepo): + self._ui = ui + self._root = os.path.dirname(root) + self.git = gitrepo + + def p1(self): + return self.git.head.peel().id.raw + + def p2(self): + # TODO: MERGE_HEAD? something like that, right? 
+ return nodemod.nullid + + def setparents(self, p1, p2=nodemod.nullid): + assert p2 == nodemod.nullid, b'TODO merging support' + self.git.head.set_target(gitutil.togitnode(p1)) + + @util.propertycache + def identity(self): + self.identity = util.filestat.frompath( + os.path.join(self.root, b'.git', b'index') + ) + + def branch(self): + return b'default' + + def parents(self): + # TODO how on earth do we find p2 if a merge is in flight? + return self.p1(), nodemod.nullid + + def __iter__(self): + return (pycompat.fsencode(f.path) for f in self.git.index) + + def items(self): + for ie in self.git.index: + yield ie.path, None # value should be a dirstatetuple + + # py2,3 compat forward + iteritems = items + + def __getitem__(self, filename): + try: + gs = self.git.status_file(filename) + except KeyError: + return b'?' + return _STATUS_MAP[gs] + + def __contains__(self, filename): + try: + gs = self.git.status_file(filename) + return _STATUS_MAP[gs] != b'?' + except KeyError: + return False + + def status(self, match, subrepos, ignored, clean, unknown): + # TODO handling of clean files - can we get that from git.status()? 
+ modified, added, removed, deleted, unknown, ignored, clean = ( + [], + [], + [], + [], + [], + [], + [], + ) + gstatus = self.git.status() + for path, status in gstatus.items(): + path = pycompat.fsencode(path) + if status == pygit2.GIT_STATUS_IGNORED: + if path.endswith(b'/'): + continue + ignored.append(path) + elif status in ( + pygit2.GIT_STATUS_WT_MODIFIED, + pygit2.GIT_STATUS_INDEX_MODIFIED, + pygit2.GIT_STATUS_WT_MODIFIED + | pygit2.GIT_STATUS_INDEX_MODIFIED, + ): + modified.append(path) + elif status == pygit2.GIT_STATUS_INDEX_NEW: + added.append(path) + elif status == pygit2.GIT_STATUS_WT_NEW: + unknown.append(path) + elif status == pygit2.GIT_STATUS_WT_DELETED: + deleted.append(path) + elif status == pygit2.GIT_STATUS_INDEX_DELETED: + removed.append(path) + else: + raise error.Abort( + b'unhandled case: status for %r is %r' % (path, status) + ) + + # TODO are we really always sure of status here? + return ( + False, + scmutil.status( + modified, added, removed, deleted, unknown, ignored, clean + ), + ) + + def flagfunc(self, buildfallback): + # TODO we can do better + return buildfallback() + + def getcwd(self): + # TODO is this a good way to do this? + return os.path.dirname( + os.path.dirname(pycompat.fsencode(self.git.path)) + ) + + def normalize(self, path): + assert util.normcase(path) == path, b'TODO handling of case folding' + return path + + @property + def _checklink(self): + return util.checklink(os.path.dirname(pycompat.fsencode(self.git.path))) + + def copies(self): + # TODO support copies? + return {} + + # # TODO what the heck is this + _filecache = set() + + def pendingparentchange(self): + # TODO: we need to implement the context manager bits and + # correctly stage/revert index edits. 
+ return False + + def write(self, tr): + + if tr: + + def writeinner(category): + self.git.index.write() + + tr.addpending(b'gitdirstate', writeinner) + else: + self.git.index.write() + + def pathto(self, f, cwd=None): + if cwd is None: + cwd = self.getcwd() + # TODO core dirstate does something about slashes here + assert isinstance(f, bytes) + r = util.pathto(self._root, cwd, f) + return r + + def matches(self, match): + for x in self.git.index: + p = pycompat.fsencode(x.path) + if match(p): + yield p + + def normal(self, f, parentfiledata=None): + """Mark a file normal and clean.""" + # TODO: for now we just let libgit2 re-stat the file. We can + # clearly do better. + + def normallookup(self, f): + """Mark a file normal, but possibly dirty.""" + # TODO: for now we just let libgit2 re-stat the file. We can + # clearly do better. + + def walk(self, match, subrepos, unknown, ignored, full=True): + # TODO: we need to use .status() and not iterate the index, + # because the index doesn't force a re-walk and so `hg add` of + # a new file without an intervening call to status will + # silently do nothing. + r = {} + cwd = self.getcwd() + for path, status in self.git.status().items(): + if path.startswith('.hg/'): + continue + path = pycompat.fsencode(path) + if not match(path): + continue + # TODO construct the stat info from the status object? + try: + s = os.stat(os.path.join(cwd, path)) + except OSError as e: + if e.errno != errno.ENOENT: + raise + continue + r[path] = s + return r + + def savebackup(self, tr, backupname): + # TODO: figure out a strategy for saving index backups. + pass + + def restorebackup(self, tr, backupname): + # TODO: figure out a strategy for saving index backups. + pass + + def add(self, f): + self.git.index.add(pycompat.fsdecode(f)) + + def drop(self, f): + self.git.index.remove(pycompat.fsdecode(f)) + + def copied(self, path): + # TODO: track copies? 
+ return None + + @contextlib.contextmanager + def parentchange(self): + # TODO: track this maybe? + yield diff --git a/hgext/git/gitlog.py b/hgext/git/gitlog.py new file mode 100644 --- /dev/null +++ b/hgext/git/gitlog.py @@ -0,0 +1,565 @@ +from __future__ import absolute_import + +import pygit2 + +from mercurial.i18n import _ + +from mercurial import ( + ancestor, + changelog as hgchangelog, + dagop, + encoding, + error, + manifest, + match as matchmod, + node as nodemod, + pycompat, +) +from mercurial.interfaces import ( + repository, + util as interfaceutil, +) +from mercurial.utils import stringutil +from . import ( + gitutil, + index, +) + + +class baselog(object): # revlog.revlog): + """Common implementations between changelog and manifestlog.""" + + def __init__(self, gr, db): + self.gitrepo = gr + self._db = db + + def __len__(self): + return int( + self._db.execute('SELECT COUNT(*) FROM changelog').fetchone()[0] + ) + + def rev(self, n): + if n == nodemod.nullid: + return -1 + t = self._db.execute( + 'SELECT rev FROM changelog WHERE node = ?', (gitutil.togitnode(n),) + ).fetchone() + if t is None: + raise error.LookupError(n, b'00changelog.i', _(b'no node %d')) + return t[0] + + def node(self, r): + if r == nodemod.nullrev: + return nodemod.nullid + t = self._db.execute( + 'SELECT node FROM changelog WHERE rev = ?', (r,) + ).fetchone() + if t is None: + raise error.LookupError(r, b'00changelog.i', _(b'no node')) + return nodemod.bin(t[0]) + + def hasnode(self, n): + t = self._db.execute( + 'SELECT node FROM changelog WHERE node = ?', (n,) + ).fetchone() + return t is not None + + +class baselogindex(object): + def __init__(self, log): + self._log = log + + def has_node(self, n): + return self._log.rev(n) != -1 + + def __len__(self): + return len(self._log) + + def __getitem__(self, idx): + p1rev, p2rev = self._log.parentrevs(idx) + # TODO: it's messy that the index leaks so far out of the + # storage layer that we have to implement things like reading + # 
this raw tuple, which exposes revlog internals. + return ( + # Pretend offset is just the index, since we don't really care. + idx, + # Same with lengths + idx, # length + idx, # rawsize + -1, # delta base + idx, # linkrev TODO is this right? + p1rev, + p2rev, + self._log.node(idx), + ) + + +# TODO: an interface for the changelog type? +class changelog(baselog): + def __contains__(self, rev): + try: + self.node(rev) + return True + except error.LookupError: + return False + + @property + def filteredrevs(self): + # TODO: we should probably add a refs/hg/ namespace for hidden + # heads etc, but that's an idea for later. + return set() + + @property + def index(self): + return baselogindex(self) + + @property + def nodemap(self): + r = { + nodemod.bin(v[0]): v[1] + for v in self._db.execute('SELECT node, rev FROM changelog') + } + r[nodemod.nullid] = nodemod.nullrev + return r + + def tip(self): + t = self._db.execute( + 'SELECT node FROM changelog ORDER BY rev DESC LIMIT 1' + ).fetchone() + if t: + return nodemod.bin(t[0]) + return nodemod.nullid + + def revs(self, start=0, stop=None): + if stop is None: + stop = self.tip() + t = self._db.execute( + 'SELECT rev FROM changelog ' + 'WHERE rev >= ? AND rev <= ? 
' + 'ORDER BY REV ASC', + (start, stop), + ) + return (int(r[0]) for r in t) + + def _partialmatch(self, id): + if nodemod.wdirhex.startswith(id): + raise error.WdirUnsupported + candidates = [ + nodemod.bin(x[0]) + for x in self._db.execute( + 'SELECT node FROM changelog WHERE node LIKE ?', (id + b'%',) + ) + ] + if nodemod.nullhex.startswith(id): + candidates.append(nodemod.nullid) + if len(candidates) > 1: + raise error.AmbiguousPrefixLookupError( + id, b'00changelog.i', _(b'ambiguous identifier') + ) + if candidates: + return candidates[0] + return None + + def flags(self, rev): + return 0 + + def shortest(self, node, minlength=1): + nodehex = nodemod.hex(node) + for attempt in pycompat.xrange(minlength, len(nodehex) + 1): + candidate = nodehex[:attempt] + matches = int( + self._db.execute( + 'SELECT COUNT(*) FROM changelog WHERE node LIKE ?', + (nodehex + b'%',), + ).fetchone()[0] + ) + if matches == 1: + return candidate + return nodehex + + def headrevs(self, revs=None): + realheads = [ + int(x[0]) + for x in self._db.execute( + 'SELECT rev FROM changelog ' + 'INNER JOIN heads ON changelog.node = heads.node' + ) + ] + if revs: + return sorted([r for r in revs if r in realheads]) + return sorted(realheads) + + def changelogrevision(self, nodeorrev): + # Ensure we have a node id + if isinstance(nodeorrev, int): + n = self.node(nodeorrev) + else: + n = nodeorrev + # handle looking up nullid + if n == nodemod.nullid: + return hgchangelog._changelogrevision(extra={}) + hn = gitutil.togitnode(n) + # We've got a real commit! + files = [ + r[0] + for r in self._db.execute( + 'SELECT filename FROM changedfiles ' + 'WHERE node = ? and filenode != ?', + (hn, gitutil.nullgit), + ) + ] + filesremoved = [ + r[0] + for r in self._db.execute( + 'SELECT filename FROM changedfiles ' + 'WHERE node = ? 
and filenode = ?', + (hn, nodemod.nullhex), + ) + ] + c = self.gitrepo[hn] + return hgchangelog._changelogrevision( + manifest=n, # pretend manifest the same as the commit node + user=b'%s <%s>' + % (c.author.name.encode('utf8'), c.author.email.encode('utf8')), + # TODO: a fuzzy memory from hg-git hacking says this should be -offset + date=(c.author.time, c.author.offset), + files=files, + # TODO filesadded in the index + filesremoved=filesremoved, + description=c.message.encode('utf8'), + # TODO do we want to handle extra? how? + extra={b'branch': b'default'}, + ) + + def ancestors(self, revs, stoprev=0, inclusive=False): + revs = list(revs) + tip = self.rev(self.tip()) + for r in revs: + if r > tip: + raise IndexError(b'Invalid rev %r' % r) + return ancestor.lazyancestors( + self.parentrevs, revs, stoprev=stoprev, inclusive=inclusive + ) + + # Cleanup opportunity: this is *identical* to the revlog.py version + def descendants(self, revs): + return dagop.descendantrevs(revs, self.revs, self.parentrevs) + + def reachableroots(self, minroot, heads, roots, includepath=False): + return dagop._reachablerootspure( + self.parentrevs, minroot, roots, heads, includepath + ) + + # Cleanup opportunity: this is *identical* to the revlog.py version + def isancestor(self, a, b): + a, b = self.rev(a), self.rev(b) + return self.isancestorrev(a, b) + + # Cleanup opportunity: this is *identical* to the revlog.py version + def isancestorrev(self, a, b): + if a == nodemod.nullrev: + return True + elif a == b: + return True + elif a > b: + return False + return bool(self.reachableroots(a, [b], [a], includepath=False)) + + def parentrevs(self, rev): + n = self.node(rev) + hn = gitutil.togitnode(n) + c = self.gitrepo[hn] + p1 = p2 = nodemod.nullrev + if c.parents: + p1 = self.rev(c.parents[0].id.raw) + if len(c.parents) > 2: + raise error.Abort(b'TODO octopus merge handling') + if len(c.parents) == 2: + p2 = self.rev(c.parents[0].id.raw) + return p1, p2 + + # Private method is used at 
least by the tags code. + _uncheckedparentrevs = parentrevs + + def commonancestorsheads(self, a, b): + # TODO the revlog verson of this has a C path, so we probably + # need to optimize this... + a, b = self.rev(a), self.rev(b) + return [ + self.node(n) + for n in ancestor.commonancestorsheads(self.parentrevs, a, b) + ] + + def branchinfo(self, rev): + """Git doesn't do named branches, so just put everything on default.""" + return b'default', False + + def delayupdate(self, tr): + # TODO: I think we can elide this because we're just dropping + # an object in the git repo? + pass + + def add( + self, + manifest, + files, + desc, + transaction, + p1, + p2, + user, + date=None, + extra=None, + p1copies=None, + p2copies=None, + filesadded=None, + filesremoved=None, + ): + parents = [] + hp1, hp2 = gitutil.togitnode(p1), gitutil.togitnode(p2) + if p1 != nodemod.nullid: + parents.append(hp1) + if p2 and p2 != nodemod.nullid: + parents.append(hp2) + assert date is not None + timestamp, tz = date + sig = pygit2.Signature( + encoding.unifromlocal(stringutil.person(user)), + encoding.unifromlocal(stringutil.email(user)), + timestamp, + tz, + ) + oid = self.gitrepo.create_commit( + None, sig, sig, desc, gitutil.togitnode(manifest), parents + ) + # Set up an internal reference to force the commit into the + # changelog. Hypothetically, we could even use this refs/hg/ + # namespace to allow for anonymous heads on git repos, which + # would be neat. + self.gitrepo.references.create( + 'refs/hg/internal/latest-commit', oid, force=True + ) + # Reindex now to pick up changes. We omit the progress + # callback because this will be very quick. + index._index_repo(self.gitrepo, self._db) + return oid.raw + + +# TODO: Make a split between mutable and immutable manifest types here. 
+class gittreemanifest(object): + def __init__(self, gt, builderfn): + self._builderfn = builderfn + self._tree = gt + self._builder = None + + def __contains__(self, k): + k = pycompat.fsdecode(k) + if self._builder: + return self._builder.get(k) is not None + return k in self._tree + + def __getitem__(self, k): + if self._builder: + match = self._builder.get(k) + if match is None: + raise error.LookupError( + b'File %r not found in tree %r' % (k, self._tree.id.hex) + ) + return match + try: + return self._tree[k].id.raw + except ValueError: + raise error.LookupError( + b'File %r not found in tree %r' % (k, self._tree.id.hex) + ) + + def __setitem__(self, k, v): + if self._builder is None: + self._builder = self._builderfn() + self._builder.insert( + pycompat.fsdecode(k), gitutil.togitnode(v), pygit2.GIT_FILEMODE_BLOB + ) + + def setflag(self, p, flag): + up = pycompat.fsdecode(p) + oid = self._builder.get(up).id + if not flag: + self._builder.insert(up, oid, pygit2.GIT_FILEMODE_BLOB) + elif flag == b'x': + self._builder.insert(up, oid, pygit2.GIT_FILEMODE_BLOB_EXECUTABLE) + elif flag == b'l': + self._builder.insert(up, oid, pygit2.GIT_FILEMODE_LINK) + else: + raise ValueError(b'Illegal flag value %r on path %r' % flag, p) + + def flags(self, k): + # TODO flags handling + return b'' + + def _walkonetree(self, tree, match, subdir): + for te in tree: + # TODO: can we prune dir walks with the matcher? 
+ realname = subdir + pycompat.fsencode(te.name) + if te.type == r'tree': + for inner in self._walkonetree( + self.gitrepo[te.id], match, realname + b'/' + ): + yield inner + if not match(realname): + continue + yield pycompat.fsencode(realname) + + def walk(self, match): + return self._walkonetree(self._tree, match, b'') + + def get(self, fname, default=None): + if fname in self: + return self[fname] + return default + + +@interfaceutil.implementer(repository.imanifestrevisionstored) +class gittreemanifestctx(object): + def __init__(self, repo, gittree): + self._repo = repo + self._tree = gittree + self._builder = None + + def _getbuilder(self): + if self._builder is None: + self._builder = self._repo.TreeBuilder(self._tree) + return self._builder + + def read(self): + return gittreemanifest(self._tree, self._getbuilder) + + def find(self, path): + self.read()[path] + + def copy(self): + return gittreemanifestctx(self._repo, self._tree) + + def write(self, transaction, link, p1, p2, added, removed, match=None): + # We're not (for now, anyway) going to audit filenames, so we + # can ignore added and removed. + + # TODO what does this match argument get used for? hopefully + # just narrow? 
+ assert not match or isinstance(match, matchmod.alwaysmatcher) + return self._getbuilder().write().raw + + +class manifestlog(baselog): + def __getitem__(self, node): + return self.get(b'', node) + + def get(self, relpath, node): + if node == nodemod.nullid: + return manifest.memtreemanifestctx(self, relpath) + commit = self.gitrepo[gitutil.togitnode(node)] + t = commit.tree + if relpath: + parts = relpath.split(b'/') + for p in parts: + te = t[p] + t = self.gitrepo[te.id] + return gittreemanifestctx(self.gitrepo, t) + + +@interfaceutil.implementer(repository.ifilestorage) +class filelog(baselog): + def __init__(self, gr, db, path): + super(filelog, self).__init__(gr, db) + assert isinstance(path, bytes) + self.path = path + + def read(self, node): + return self.gitrepo[gitutil.togitnode(node)].data + + def lookup(self, node): + if len(node) not in (20, 40): + node = int(node) + if isinstance(node, int): + assert False, b'todo revnums for nodes' + if len(node) == 40: + node = nodemod.bin(node) + hnode = gitutil.togitnode(node) + if hnode in self.gitrepo: + return node + raise error.LookupError(self.path, node, _(b'no match found')) + + def cmp(self, node, text): + """Returns True if text is different than content at `node`.""" + return self.read(node) != text + + def add(self, text, meta, transaction, link, p1=None, p2=None): + assert not meta # Should we even try to handle this? + return self.gitrepo.create_blob(text).raw + + def __iter__(self): + for clrev in self._db.execute( + ''' +SELECT rev FROM changelog +INNER JOIN changedfiles ON changelog.node = changedfiles.node +WHERE changedfiles.filename = ? AND changedfiles.filenode != ? + ''', + (pycompat.fsdecode(self.path), gitutil.nullgit), + ): + yield clrev[0] + + def linkrev(self, fr): + return fr + + def rev(self, node): + row = self._db.execute( + ''' +SELECT rev FROM changelog +INNER JOIN changedfiles ON changelog.node = changedfiles.node +WHERE changedfiles.filename = ? 
AND changedfiles.filenode = ?''', + (pycompat.fsdecode(self.path), gitutil.togitnode(node)), + ).fetchone() + if row is None: + raise error.LookupError(self.path, node, _(b'no such node')) + return int(row[0]) + + def node(self, rev): + maybe = self._db.execute( + '''SELECT filenode FROM changedfiles +INNER JOIN changelog ON changelog.node = changedfiles.node +WHERE changelog.rev = ? AND filename = ? +''', + (rev, pycompat.fsdecode(self.path)), + ).fetchone() + if maybe is None: + raise IndexError('gitlog %r out of range %d' % (self.path, rev)) + return nodemod.bin(maybe[0]) + + def parents(self, node): + gn = gitutil.togitnode(node) + gp = pycompat.fsdecode(self.path) + ps = [] + for p in self._db.execute( + '''SELECT p1filenode, p2filenode FROM changedfiles +WHERE filenode = ? AND filename = ? +''', + (gn, gp), + ).fetchone(): + if p is None: + commit = self._db.execute( + "SELECT node FROM changedfiles " + "WHERE filenode = ? AND filename = ?", + (gn, gp), + ).fetchone()[0] + # This filelog is missing some data. Build the + # filelog, then recurse (which will always find data). + if pycompat.ispy3: + commit = commit.decode('ascii') + index.fill_in_filelog(self.gitrepo, self._db, commit, gp, gn) + return self.parents(node) + else: + ps.append(nodemod.bin(p)) + return ps + + def renamed(self, node): + # TODO: renames/copies + return False diff --git a/hgext/git/gitutil.py b/hgext/git/gitutil.py new file mode 100644 --- /dev/null +++ b/hgext/git/gitutil.py @@ -0,0 +1,28 @@ +"""utilities to assist in working with pygit2""" +from __future__ import absolute_import + +from mercurial.node import bin, hex, nullid + +from mercurial import pycompat + + +def togitnode(n): + """Wrapper to convert a Mercurial binary node to a unicode hexlified node. + + pygit2 and sqlite both need nodes as strings, not bytes. 
+    """
+    assert len(n) == 20
+    if pycompat.ispy3:
+        return hex(n).decode('ascii')
+    return hex(n)
+
+
+def fromgitnode(n):
+    """Opposite of togitnode."""
+    assert len(n) == 40
+    if pycompat.ispy3:
+        return bin(n.encode('ascii'))
+    return bin(n)
+
+
+nullgit = togitnode(nullid)
diff --git a/hgext/git/index.py b/hgext/git/index.py
new file mode 100644
--- /dev/null
+++ b/hgext/git/index.py
@@ -0,0 +1,403 @@
+from __future__ import absolute_import
+
+import collections
+import os
+import sqlite3
+
+import pygit2
+
+from mercurial.i18n import _
+
+from mercurial import (
+    encoding,
+    error,
+    node as nodemod,
+    pycompat,
+)
+
+from . import gitutil
+
+
+_CURRENT_SCHEMA_VERSION = 1
+_SCHEMA = (
+    """
+CREATE TABLE refs (
+  -- node and name are unique together. There may be more than one name for
+  -- a given node, and there may be no name at all for a given node (in the
+  -- case of an anonymous hg head).
+  node TEXT NOT NULL,
+  name TEXT
+);
+
+-- The "possible heads" of the repository, which we use to figure out
+-- if we need to re-walk the changelog.
+CREATE TABLE possible_heads (
+  node TEXT NOT NULL
+);
+
+-- The topological heads of the changelog, which hg depends on.
+CREATE TABLE heads (
+  node TEXT NOT NULL
+);
+
+-- A total ordering of the changelog
+CREATE TABLE changelog (
+  rev INTEGER NOT NULL PRIMARY KEY,
+  node TEXT NOT NULL,
+  p1 TEXT,
+  p2 TEXT
+);
+
+CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
+CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);
+
+-- Changed files for each commit, which lets us dynamically build
+-- filelogs.
+CREATE TABLE changedfiles (
+  node TEXT NOT NULL,
+  filename TEXT NOT NULL,
+  -- 40 zeroes for deletions
+  filenode TEXT NOT NULL,
+-- to handle filelog parentage:
+  p1node TEXT,
+  p1filenode TEXT,
+  p2node TEXT,
+  p2filenode TEXT
+);
+
+CREATE INDEX changedfiles_nodes_idx
+  ON changedfiles(node);
+
+PRAGMA user_version=%d
+"""
+    % _CURRENT_SCHEMA_VERSION
+)
+
+
+def _createdb(path):
+    """Open the sqlite index at ``path``, creating the schema if needed.
+
+    Args:
+      path: filesystem path of the sqlite database, as local-encoded bytes.
+
+    Returns:
+      A sqlite3 connection whose text columns are returned as bytes.
+
+    Raises:
+      error.Abort: if the database reports an unrecognized schema version.
+    """
+    db = sqlite3.connect(encoding.strfromlocal(path))
+    db.text_factory = bytes
+
+    res = db.execute('PRAGMA user_version').fetchone()[0]
+
+    # New database.
+    if res == 0:
+        for statement in _SCHEMA.split(';'):
+            db.execute(statement.strip())
+
+        db.commit()
+
+    elif res == _CURRENT_SCHEMA_VERSION:
+        pass
+
+    else:
+        raise error.Abort(_(b'sqlite database has unrecognized version'))
+
+    db.execute('PRAGMA journal_mode=WAL')
+
+    return db
+
+
+_OUR_ORDER = (
+    pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_REVERSE
+)
+
+_DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2
+
+
+def _find_nearest_ancestor_introducing_node(
+    db, gitrepo, file_path, walk_start, filenode
+):
+    """Find the nearest ancestor that introduces a file node.
+
+    Args:
+      db: a handle to our sqlite database.
+      gitrepo: A pygit2.Repository instance.
+      file_path: the path of a file in the repo
+      walk_start: a pygit2.Oid that is a commit where we should start walking
+                  for our nearest ancestor.
+      filenode: the hexlified node id of the file version to look for.
+
+    Returns:
+      The hexlified commit ID of the first commit reached from walk_start
+      that is recorded in changedfiles as touching file_path with filenode.
+
+    Raises:
+      error.ProgrammingError: if no such commit is reachable.
+    """
+    assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
+        file_path
+    )
+    assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
+        filenode
+    )
+    parent_options = {
+        row[0].decode('ascii')
+        for row in db.execute(
+            'SELECT node FROM changedfiles '
+            'WHERE filename = ? AND filenode = ?',
+            (file_path, filenode),
+        )
+    }
+    inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
+    for w in inner_walker:
+        if w.id.hex in parent_options:
+            return w.id.hex
+    # Interpolate the details ourselves: extra positional arguments to the
+    # exception are not formatted into the message.
+    raise error.ProgrammingError(
+        'Unable to find introducing commit for %s node %s from %s'
+        % (file_path, filenode, walk_start)
+    )
+
+
+def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
+    """Given a starting commit and path, fill in a filelog's parent pointers.
+
+    Args:
+      gitrepo: a pygit2.Repository
+      db: a handle to our sqlite database
+      startcommit: a hexlified node id for the commit to start at
+      path: the path of the file whose parent pointers we should fill in.
+      startfilenode: the hexlified node id of the file at startcommit
+
+    TODO: make startfilenode optional
+    """
+    assert isinstance(
+        startcommit, str
+    ), 'startcommit must be str, got %r' % type(startcommit)
+    assert isinstance(
+        startfilenode, str
+    ), 'startfilenode must be str, got %r' % type(startfilenode)
+    visit = collections.deque([(startcommit, startfilenode)])
+    while visit:
+        cnode, filenode = visit.popleft()
+        commit = gitrepo[cnode]
+        parents = []
+        # Resolve path in each parent's tree; parents lacking the file are
+        # skipped (the inner loop breaks, so the else clause never runs).
+        for parent in commit.parents:
+            t = parent.tree
+            for comp in path.split('/'):
+                try:
+                    t = gitrepo[t[comp].id]
+                except KeyError:
+                    break
+            else:
+                introducer = _find_nearest_ancestor_introducing_node(
+                    db, gitrepo, path, parent.id, t.id.hex
+                )
+                parents.append((introducer, t.id.hex))
+        p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
+        for par, parfnode in parents:
+            found = int(
+                db.execute(
+                    'SELECT COUNT(*) FROM changedfiles WHERE '
+                    'node = ? AND filename = ? AND filenode = ? AND '
+                    'p1node NOT NULL',
+                    (par, path, parfnode),
+                ).fetchone()[0]
+            )
+            # Queue parents that have no row with p1node already set.
+            if found == 0:
+                assert par is not None
+                visit.append((par, parfnode))
+        if parents:
+            p1node, p1fnode = parents[0]
+        if len(parents) == 2:
+            p2node, p2fnode = parents[1]
+        if len(parents) > 2:
+            raise error.ProgrammingError(
+                b"git support can't handle octopus merges"
+            )
+        db.execute(
+            'UPDATE changedfiles SET '
+            'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
+            'WHERE node = ? AND filename = ? AND filenode = ?',
+            (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
+        )
+    db.commit()
+
+
+def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
+    """Bring the sqlite index in ``db`` up to date with ``gitrepo``.
+
+    Walks every commit reachable from the branch, tag, remote and refs/hg/
+    references, rebuilds the changelog table, records changed files for
+    commits that have none recorded yet and refreshes the heads tables.
+    Nothing is rewritten when the commits those references point at match
+    the set stored by the previous run.
+
+    Args:
+      gitrepo: a pygit2.Repository
+      db: a handle to our sqlite database
+      progress_factory: called with a topic; returns either None or an
+                        object providing update(pos) and complete().
+
+    Raises:
+      error.ProgrammingError: if a commit has more than two parents.
+    """
+    # Identify all references so we can tell the walker to visit all of them.
+    all_refs = gitrepo.listall_references()
+    possible_heads = set()
+    prog = progress_factory(b'refs')
+    wanted_prefixes = (
+        'refs/heads/',  # local branch
+        'refs/tags/',  # tag
+        'refs/remotes/',  # remote branch
+        'refs/hg/',  # from this extension
+    )
+    for pos, ref in enumerate(all_refs):
+        if prog is not None:
+            prog.update(pos)
+        if not ref.startswith(wanted_prefixes):
+            continue
+        try:
+            start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
+        except ValueError:
+            # No commit to be found, so we don't care for hg's purposes.
+            continue
+        possible_heads.add(start.id)
+    # Optimization: if the list of heads hasn't changed, don't
+    # reindex the changelog. This doesn't matter on small
+    # repositories, but on even moderately deep histories (eg cpython)
+    # this is a very important performance win.
+    #
+    # TODO: we should figure out how to incrementally index history
+    # (preferably by detecting rewinds!) so that we don't have to do a
+    # full changelog walk every time a new commit is created.
+    #
+    # The database returns text as bytes, so convert to native strings
+    # before comparing against the hex ids pygit2 gives us.
+    cache_heads = {
+        pycompat.sysstr(x[0])
+        for x in db.execute('SELECT node FROM possible_heads')
+    }
+    walker = None
+    cur_cache_heads = {h.hex for h in possible_heads}
+    if cur_cache_heads == cache_heads:
+        if prog is not None:
+            prog.complete()
+        return
+    for start in possible_heads:
+        if walker is None:
+            walker = gitrepo.walk(start, _OUR_ORDER)
+        else:
+            walker.push(start)
+
+    # Empty out the existing changelog. Even for large-ish histories
+    # we can do the top-level "walk all the commits" dance very
+    # quickly as long as we don't need to figure out the changed files
+    # list.
+    db.execute('DELETE FROM changelog')
+    if prog is not None:
+        prog.complete()
+    prog = progress_factory(b'commits')
+    # This walker is sure to visit all the revisions in history, but
+    # only once. Without any heads there is no walker and nothing to visit.
+    commits = walker if walker is not None else ()
+    for pos, commit in enumerate(commits):
+        if prog is not None:
+            prog.update(pos)
+        p1 = p2 = nodemod.nullhex
+        if len(commit.parents) > 2:
+            raise error.ProgrammingError(
+                (
+                    b"git support can't handle octopus merges, "
+                    b"found a commit with %d parents :("
+                )
+                % len(commit.parents)
+            )
+        if commit.parents:
+            p1 = commit.parents[0].id.hex
+        if len(commit.parents) == 2:
+            p2 = commit.parents[1].id.hex
+        db.execute(
+            'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
+            (pos, commit.id.hex, p1, p2),
+        )
+
+        num_changedfiles = db.execute(
+            "SELECT COUNT(*) from changedfiles WHERE node = ?",
+            (commit.id.hex,),
+        ).fetchone()[0]
+        if not num_changedfiles:
+            # I *think* we only need to check p1 for changed files
+            # (and therefore linkrevs), because any node that would
+            # actually have this commit as a linkrev would be
+            # completely new in this rev.
+            if commit.parents:
+                patchgen = gitrepo.diff(
+                    commit.parents[0].id.hex, commit.id.hex, flags=_DIFF_FLAGS
+                )
+            else:
+                patchgen = commit.tree.diff_to_tree(
+                    swap=True, flags=_DIFF_FLAGS
+                )
+            new_files = (p.delta.new_file for p in patchgen)
+            files = {
+                nf.path: nf.id.hex
+                for nf in new_files
+                if nf.id.raw != nodemod.nullid
+            }
+            for p, n in files.items():
+                # We intentionally set NULLs for any file parentage
+                # information so it'll get demand-computed later. We
+                # used to do it right here, and it was _very_ slow.
+                db.execute(
+                    'INSERT INTO changedfiles ('
+                    'node, filename, filenode, p1node, p1filenode, p2node, '
+                    'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
+                    (commit.id.hex, p, n, None, None, None, None),
+                )
+    db.execute('DELETE FROM heads')
+    db.execute('DELETE FROM possible_heads')
+    for hid in possible_heads:
+        h = hid.hex
+        db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
+        haschild = db.execute(
+            'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
+        ).fetchone()[0]
+        if not haschild:
+            db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
+
+    db.commit()
+    if prog is not None:
+        prog.complete()
+
+
+def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
+    """Open the sqlite index for ``gitrepo`` and make sure it is current.
+
+    The database lives at .hg/cache/git-commits.sqlite next to the git
+    directory; the cache directory is created if it is missing.
+
+    Args:
+      gitrepo: a pygit2.Repository
+      progress_factory: passed through to the indexing step.
+
+    Returns:
+      The sqlite3 connection to the index.
+    """
+    cachepath = os.path.join(
+        pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
+    )
+    if not os.path.exists(cachepath):
+        os.makedirs(cachepath)
+    dbpath = os.path.join(cachepath, b'git-commits.sqlite')
+    db = _createdb(dbpath)
+    # TODO check against gitrepo heads before doing a full index
+    # TODO thread a ui.progress call into this layer
+    _index_repo(gitrepo, db, progress_factory)
+    return db
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -1212,6 +1212,7 @@
     'hgext.fsmonitor',
     'hgext.fastannotate',
     'hgext.fsmonitor.pywatchman',
+    'hgext.git',
     'hgext.highlight',
     'hgext.infinitepush',
     'hgext.largefiles',
diff --git a/tests/test-git-interop.t b/tests/test-git-interop.t new file mode 100644 --- /dev/null +++ b/tests/test-git-interop.t @@ -0,0 +1,188 @@ +This test requires pygit2: + > python -c 'import pygit2' || exit 80 + +Setup: + > GIT_AUTHOR_NAME='test'; export GIT_AUTHOR_NAME + > GIT_AUTHOR_EMAIL='test@example.org'; export GIT_AUTHOR_EMAIL + > GIT_AUTHOR_DATE="2007-01-01 00:00:00 +0000"; export GIT_AUTHOR_DATE + > GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME"; export GIT_COMMITTER_NAME + > GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL"; export GIT_COMMITTER_EMAIL + > GIT_COMMITTER_DATE="$GIT_AUTHOR_DATE"; export GIT_COMMITTER_DATE + + > count=10 + > gitcommit() { + > GIT_AUTHOR_DATE="2007-01-01 00:00:$count +0000"; + > GIT_COMMITTER_DATE="$GIT_AUTHOR_DATE" + > git commit "$@" >/dev/null 2>/dev/null || echo "git commit error" + > count=`expr $count + 1` + > } + + > echo "[extensions]" >> $HGRCPATH + > echo "git=" >> $HGRCPATH + +Make a new repo with git: + $ mkdir foo + $ cd foo + $ git init + Initialized empty Git repository in $TESTTMP/foo/.git/ +Ignore the .hg directory within git: + $ echo .hg >> 
.git/info/exclude + $ echo alpha > alpha + $ git add alpha + $ gitcommit -am 'Add alpha' + $ echo beta > beta + $ git add beta + $ gitcommit -am 'Add beta' + $ echo gamma > gamma + $ git status + On branch master + Untracked files: + (use "git add ..." to include in what will be committed) + gamma + + nothing added to commit but untracked files present (use "git add" to track) + +Without creating the .hg, hg status fails: + $ hg status + abort: no repository found in '$TESTTMP/foo' (.hg not found)! + [255] +But if you run hg init --git, it works: + $ hg init --git + $ hg id --traceback + 3d9be8deba43 tip master + $ hg status + ? gamma +Log works too: + $ hg log + changeset: 1:3d9be8deba43 + bookmark: master + tag: tip + user: test + date: Mon Jan 01 00:00:11 2007 +0000 + summary: Add beta + + changeset: 0:c5864c9d16fb + user: test + date: Mon Jan 01 00:00:10 2007 +0000 + summary: Add alpha + + + +and bookmarks: + $ hg bookmarks + * master 1:3d9be8deba43 + +diff even works transparently in both systems: + $ echo blah >> alpha + $ git diff + diff --git a/alpha b/alpha + index 4a58007..faed1b7 100644 + --- a/alpha + +++ b/alpha + @@ -1* +1,2 @@ (glob) + alpha + +blah + $ hg diff --git + diff --git a/alpha b/alpha + --- a/alpha + +++ b/alpha + @@ -1,1 +1,2 @@ + alpha + +blah + +Remove a file, it shows as such: + $ rm alpha + $ hg status + ! alpha + ? gamma + +Revert works: + $ hg revert alpha --traceback + $ hg status + ? gamma + $ git status + On branch master + Untracked files: + (use "git add ..." to include in what will be committed) + gamma + + nothing added to commit but untracked files present (use "git add" to track) + +Add shows sanely in both: + $ hg add gamma + $ hg status + A gamma + $ hg files + alpha + beta + gamma + $ git ls-files + alpha + beta + gamma + $ git status + On branch master + Changes to be committed: + (use "git restore --staged ..." to unstage) + new file: gamma + + +forget does what it should as well: + $ hg forget gamma + $ hg status + ? 
gamma + $ git status + On branch master + Untracked files: + (use "git add ..." to include in what will be committed) + gamma + + nothing added to commit but untracked files present (use "git add" to track) + +hg log FILE + + $ echo a >> alpha + $ hg ci -m 'more alpha' --traceback + $ echo b >> beta + $ hg ci -m 'more beta' + $ echo a >> alpha + $ hg ci -m 'even more alpha' + $ hg log -G alpha + @ changeset: 4:bd975ddde71c + : bookmark: master + : tag: tip + : user: test + : date: Thu Jan 01 00:00:00 1970 +0000 + : summary: even more alpha + : + o changeset: 2:77f597222800 + : user: test + : date: Thu Jan 01 00:00:00 1970 +0000 + : summary: more alpha + : + o changeset: 0:c5864c9d16fb + user: test + date: Mon Jan 01 00:00:10 2007 +0000 + summary: Add alpha + + $ hg log -G beta + o changeset: 3:b40d4fed5e27 + : user: test + : date: Thu Jan 01 00:00:00 1970 +0000 + : summary: more beta + : + o changeset: 1:3d9be8deba43 + | user: test + ~ date: Mon Jan 01 00:00:11 2007 +0000 + summary: Add beta + + +hg annotate + + $ hg annotate alpha + 0: alpha + 2: a + 4: a + $ hg annotate beta + 1: beta + 3: b