This is an archive of the discontinued Mercurial Phabricator instance.

Differential D2631

[DRAFT] xdiff: skip trimmed lines when preparing the hashtable
AbandonedPublic

Authored by quark on Mar 3 2018, 9:51 PM.

Download Raw Diff

Details

Reviewers

None

Group Reviewers

hg-reviewers

Summary

NOTE: I'm still reading xdiffi.c to understand whether this is a safe optimization or not.

xdiff has an "xdl_trim_ends" function that removes the common prefix and suffix.

Previously, xdiff built a hashtable for all lines. That is a waste of
time for trimmed lines. This diff changes the logic so trimmed lines are
ignored when building the hashtable. Note: the hashtable is still needed for
hunk shifting, so it does not blindly take whatever xdl_trim_ends says,
but also looks at the lines surrounding the untrimmed range.

For the following test case:

#!python
open('a','w').write(''.join('%s\n' % (i % 100000) for i in xrange(10000000)))
open('b','w').write(''.join('%s\n' % (i % 100000) for i in xrange(10000001)))

This series reduces xdiff's time for the above case from 1.1 seconds (D2604)
to 0.6 seconds.

Benchmarking on commands.py, it's 1/4 faster:

hg perfbdiff --count 3000 --blocks --xdiff .hg/store/data/mercurial/commands.py.i 1
# before
! wall 2.050600 comb 2.050000 user 2.040000 sys 0.010000 (best of 5)
# after
! wall 1.510821 comb 1.500000 user 1.500000 sys 0.000000 (best of 6)

However, GNU diffutils performs even better (<0.1 seconds), so there is
still room to catch up.

Diff Detail

Repository

rHG Mercurial

Lint

Lint Skipped

Unit

Unit Tests Skipped

Event Timeline

quark created this revision.Mar 3 2018, 9:51 PM

Herald added a reviewer: hg-reviewers. · View Herald TranscriptMar 3 2018, 9:51 PM

Herald added a subscriber: mercurial-devel. · View Herald Transcript

quark retitled this revision from [RFC] xdiff: skip trimmed lines when preparing the hashtable to [DRAFT] xdiff: skip trimmed lines when preparing the hashtable.Mar 3 2018, 9:52 PM

quark edited the summary of this revision. (Show Details)Mar 3 2018, 10:18 PM

quark added a child revision: D2634: [DRAFT] xdiff: avoid hashing trimmed lines.Mar 3 2018, 11:21 PM

quark abandoned this revision.Mar 4 2018, 7:50 PM

Revision Contents
Changeset List

			Path	Packages
M			mercurial/thirdparty/xdiff/xprepare.c (47 lines)

Status	Author	Revision
Abandoned	quark	D2634 [DRAFT] xdiff: avoid hashing trimmed lines
Abandoned	quark	D2631 [DRAFT] xdiff: skip trimmed lines when preparing the hashtable
Abandoned	quark	D2630 xdiff: move hashtable calculation to a separate function
Abandoned	quark	D2629 xdiff: do not rely on hashtable in xdl_trim_ends
Abandoned	quark	D2628 xdiff: expand xdl_optimize_ctxs

Diff 6530

mercurial/thirdparty/xdiff/xprepare.c

	static int xdl_classify_record(unsigned int pass, xdlclassifier_t cf, xrecord_t *rhash,			static int xdl_classify_record(unsigned int pass, xdlclassifier_t cf, xrecord_t *rhash,
	unsigned int hbits, xrecord_t *rec);			unsigned int hbits, xrecord_t *rec);
	static int xdl_prepare_ctx(unsigned int pass, mmfile_t mf, long narec, xpparam_t const xpp,			static int xdl_prepare_ctx(unsigned int pass, mmfile_t mf, long narec, xpparam_t const xpp,
	xdlclassifier_t cf, xdfile_t xdf);			xdlclassifier_t cf, xdfile_t xdf);
	static void xdl_free_ctx(xdfile_t *xdf);			static void xdl_free_ctx(xdfile_t *xdf);
	static int xdl_clean_mmatch(char const *dis, long i, long s, long e);			static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
	static int xdl_cleanup_records(xdlclassifier_t cf, xdfile_t xdf1, xdfile_t *xdf2);			static int xdl_cleanup_records(xdlclassifier_t cf, xdfile_t xdf1, xdfile_t *xdf2);
	static int xdl_trim_ends(xdfile_t xdf1, xdfile_t xdf2);			static int xdl_trim_ends(xdfile_t xdf1, xdfile_t xdf2);
				static long xdl_trim_reserved_lines(xdfile_t xdf1, xdfile_t xdf2);




	static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {			static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
	cf->flags = flags;			cf->flags = flags;

	cf->hbits = xdl_hashbits((unsigned int) size);			cf->hbits = xdl_hashbits((unsigned int) size);
	return -1;			return -1;
	}			}

	/*			/*
	* Adjust hash values for records (lines) in a file so the hash values become			* Adjust hash values for records (lines) in a file so the hash values become
	* unique. This makes future calculation faster - they can just compare "ha"			* unique. This makes future calculation faster - they can just compare "ha"
	* instead of comparing line content.			* instead of comparing line content.
	*/			*/
	static int xdl_prepare_hashtable(unsigned int pass, mmfile_t *mf,			static int xdl_prepare_hashtable(unsigned int pass, long reserved, mmfile_t
	xpparam_t const xpp, xdlclassifier_t cf, xdfile_t *xdf) {			mf, xpparam_t const xpp, xdlclassifier_t cf, xdfile_t xdf)
				{
	xrecord_t **rhash = NULL;			xrecord_t **rhash = NULL;
	long nrec = xdf->nrec;			long nrec;

	unsigned int hbits;			unsigned int hbits;
	long hsize;			long hsize;
	long i;			long i;
				long start = xdf->dstart - reserved;
				long end = xdf->dend + reserved;

				if (start < 0)
				start = 0;
				if (end >= xdf->nrec)
				end = xdf->nrec - 1;

				nrec = end - start;

	hbits = xdl_hashbits((unsigned int) nrec);			hbits = xdl_hashbits((unsigned int) nrec);
	hsize = 1 << hbits;			hsize = 1 << hbits;
	if (!(rhash = (xrecord_t *) xdl_malloc(hsize sizeof(xrecord_t *))))			if (!(rhash = (xrecord_t *) xdl_malloc(hsize sizeof(xrecord_t *))))
	goto abort;			goto abort;
	memset(rhash, 0, hsize * sizeof(xrecord_t *));			memset(rhash, 0, hsize * sizeof(xrecord_t *));

	for (i = 0; i < nrec; ++i) {			for (i = start; i <= end; i++) {
	if (xdl_classify_record(pass, cf, rhash, hbits, xdf->recs[i]) < 0)			if (xdl_classify_record(pass, cf, rhash, hbits, xdf->recs[i]) < 0)
	goto abort;			goto abort;
	}			}

	xdf->hbits = hbits;			xdf->hbits = hbits;
	xdf->rhash = rhash;			xdf->rhash = rhash;

	return 0;			return 0;
	xdl_free(xdf->ha);			xdl_free(xdf->ha);
	xdl_free(xdf->recs);			xdl_free(xdf->recs);
	xdl_cha_free(&xdf->rcha);			xdl_cha_free(&xdf->rcha);
	}			}


	int xdl_prepare_env(mmfile_t mf1, mmfile_t mf2, xpparam_t const *xpp,			int xdl_prepare_env(mmfile_t mf1, mmfile_t mf2, xpparam_t const *xpp,
	xdfenv_t *xe) {			xdfenv_t *xe) {
	long enl1, enl2, sample;			long enl1, enl2, sample, reserved;
	xdlclassifier_t cf;			xdlclassifier_t cf;

	memset(&cf, 0, sizeof(cf));			memset(&cf, 0, sizeof(cf));

	sample = XDL_GUESS_NLINES1;			sample = XDL_GUESS_NLINES1;

	enl1 = xdl_guess_lines(mf1, sample) + 1;			enl1 = xdl_guess_lines(mf1, sample) + 1;
	enl2 = xdl_guess_lines(mf2, sample) + 1;			enl2 = xdl_guess_lines(mf2, sample) + 1;

	if (xdl_trim_ends(&xe->xdf1, &xe->xdf2) < 0) {			if (xdl_trim_ends(&xe->xdf1, &xe->xdf2) < 0) {
	xdl_free_ctx(&xe->xdf2);			xdl_free_ctx(&xe->xdf2);
	xdl_free_ctx(&xe->xdf1);			xdl_free_ctx(&xe->xdf1);
	xdl_free_classifier(&cf);			xdl_free_classifier(&cf);
	return -1;			return -1;
	}			}

	if (xdl_prepare_hashtable(1, mf1, xpp, &cf, &xe->xdf1) < 0) {			reserved = xdl_trim_reserved_lines(&xe->xdf1, &xe->xdf2);
				if (xdl_prepare_hashtable(1, reserved, mf1, xpp, &cf, &xe->xdf1) < 0) {
	xdl_free_ctx(&xe->xdf1);			xdl_free_ctx(&xe->xdf1);
	xdl_free_ctx(&xe->xdf2);			xdl_free_ctx(&xe->xdf2);
	xdl_free_classifier(&cf);			xdl_free_classifier(&cf);
	return -1;			return -1;
	}			}
	if (xdl_prepare_hashtable(2, mf2, xpp, &cf, &xe->xdf2) < 0) {			if (xdl_prepare_hashtable(2, reserved, mf2, xpp, &cf, &xe->xdf2) < 0) {
	xdl_free_ctx(&xe->xdf1);			xdl_free_ctx(&xe->xdf1);
	xdl_free_ctx(&xe->xdf2);			xdl_free_ctx(&xe->xdf2);
	xdl_free_classifier(&cf);			xdl_free_classifier(&cf);
	return -1;			return -1;
	}			}

	if (xdl_cleanup_records(&cf, &xe->xdf1, &xe->xdf2) < 0) {			if (xdl_cleanup_records(&cf, &xe->xdf1, &xe->xdf2) < 0) {
	xdl_free_ctx(&xe->xdf2);			xdl_free_ctx(&xe->xdf2);
	}			}
	xdf2->nreff = nreff;			xdf2->nreff = nreff;

	xdl_free(dis);			xdl_free(dis);

	return 0;			return 0;
	}			}


	/*			/*
	* Early trim initial and terminal matching records.			* Early trim initial and terminal matching records.
	*/			*/
	static int xdl_trim_ends(xdfile_t xdf1, xdfile_t xdf2) {			static int xdl_trim_ends(xdfile_t xdf1, xdfile_t xdf2) {
	long i, lim;			long i, lim;
	xrecord_t recs1, recs2;			xrecord_t recs1, recs2;

	recs1 = xdf1->recs;			recs1 = xdf1->recs;
	break;			break;
	}			}

	xdf1->dend = xdf1->nrec - i - 1;			xdf1->dend = xdf1->nrec - i - 1;
	xdf2->dend = xdf2->nrec - i - 1;			xdf2->dend = xdf2->nrec - i - 1;

	return 0;			return 0;
	}			}


				/*
				* Return the number of "reserved lines" to keep around the untrimmed range
				* for possible hunk shifting.
				*
				* Normally only the lines in the dstart..dend range need to be looked at,
				* but hunk shifting also needs accurate line hashes outside that range.
				* The estimate is the sum of the untrimmed range sizes (dend - dstart) of
				* both files; a file whose dend is not greater than its dstart contributes
				* nothing.
				*
				* The result is passed to xdl_prepare_hashtable, which widens dstart..dend
				* by this many lines on each side before classifying records.
				*
				* NOTE(review): the parameters are dereferenced with "->" but declared
				* without "*"; presumably the pointer declarators were lost when this page
				* was rendered - confirm against the real xprepare.c.
				*/
				static long xdl_trim_reserved_lines(xdfile_t xdf1, xdfile_t xdf2) {
				long lines = 0;
				/* Only count a file when its untrimmed range is non-empty. */
				if (xdf1->dend > xdf1->dstart)
				lines += xdf1->dend - xdf1->dstart;
				if (xdf2->dend > xdf2->dstart)
				lines += xdf2->dend - xdf2->dstart;
				return lines;
				}

Diff	ID	Base	Description	Created	Lint	Unit
Base			Base
Diff 1	6530			Mar 3 2018, 9:51 PM	★	★