Mercurial > hg4j
changeset 242:ad6a046943be
Improved reading of sparse revisions from a revlog
author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
---|---|
date | Thu, 23 Jun 2011 15:19:07 +0200 (2011-06-23) |
parents | d3ab16739736 |
children | 0e01f9182e16 |
files | cmdline/org/tmatesoft/hg/console/Main.java src/org/tmatesoft/hg/internal/RevlogStream.java src/org/tmatesoft/hg/repo/HgChangelog.java src/org/tmatesoft/hg/repo/HgDataFile.java |
diffstat | 4 files changed, 240 insertions(+), 158 deletions(-) [+] |
line wrap: on
line diff
--- a/cmdline/org/tmatesoft/hg/console/Main.java Thu Jun 23 15:16:34 2011 +0200 +++ b/cmdline/org/tmatesoft/hg/console/Main.java Thu Jun 23 15:19:07 2011 +0200 @@ -72,10 +72,10 @@ public static void main(String[] args) throws Exception { Main m = new Main(args); - m.testSubrepos(); +// m.testSubrepos(); // m.testReadWorkingCopy(); // m.testParents(); -// m.testEffectiveFileLog(); + m.testEffectiveFileLog(); // m.testCatAtCsetRevision(); // m.testMergeState(); // m.testFileStatus(); @@ -132,10 +132,22 @@ } } - // -R \temp\hg\hg4j-50 src/org/tmatesoft/hg/internal/RevlogStream.java + /* + * -R \temp\hg\hg4j-50 src/org/tmatesoft/hg/internal/RevlogStream.java + * + * -R \temp\hg\cpython Lib/doctest.py, range 15907..68588, total 251 revision + * no improvement (collect linkRev, hgchangelog.range([])) 10890 ms + * improved history logic in HgDataFile (minimize reads of close revisions): + * with no sort (defect for tool-created repos) took 10500 ms + * with sort (to order revisions from linkRev before use) 610 ms + * HgChangelog.range() - 92 calls + * RevlogStream with separate iterate(int[] sortedRevisions,...) + * RevlogStream.ReaderN1.range(): 185 380 ms + */ private void testEffectiveFileLog() { for (String fname : cmdLineOpts.getList("")) { System.out.println(fname); + final long start = System.currentTimeMillis(); HgDataFile fn = hgRepo.getFileNode(fname); if (fn.exists()) { fn.history(new HgChangelog.Inspector() { @@ -144,6 +156,7 @@ } }); } + System.out.printf("Done: %d\n", System.currentTimeMillis() - start); } }
--- a/src/org/tmatesoft/hg/internal/RevlogStream.java Thu Jun 23 15:16:34 2011 +0200 +++ b/src/org/tmatesoft/hg/internal/RevlogStream.java Thu Jun 23 15:19:07 2011 +0200 @@ -200,121 +200,55 @@ } // XXX may cache [start .. end] from index with a single read (pre-read) - Lifecycle.BasicCallback cb = null; - DataAccess daIndex = null, daData = null; - daIndex = getIndexStream(); - if (needData && !inline) { - daData = getDataStream(); - } + ReaderN1 r = new ReaderN1(needData, inspector); try { - byte[] nodeidBuf = new byte[20]; - DataAccess lastUserData = null; - int i; - boolean extraReadsToBaseRev = false; - if (needData && getBaseRevision(start) < start) { - i = getBaseRevision(start); - extraReadsToBaseRev = true; - } else { - i = start; - } - - daIndex.seek(getIndexOffsetInt(i)); - - if (inspector instanceof Lifecycle) { - cb = new Lifecycle.BasicCallback(); - ((Lifecycle) inspector).start(end - start + 1, cb, cb); - } - - for (; i <= end; i++ ) { - if (inline && needData) { - // inspector reading data (though FilterDataAccess) may have affected index position - daIndex.seek(getIndexOffsetInt(i)); - } - long l = daIndex.readLong(); // 0 - long offset = i == 0 ? 0 : (l >>> 16); - @SuppressWarnings("unused") - int flags = (int) (l & 0X0FFFF); - int compressedLen = daIndex.readInt(); // +8 - int actualLen = daIndex.readInt(); // +12 - int baseRevision = daIndex.readInt(); // +16 - int linkRevision = daIndex.readInt(); // +20 - int parent1Revision = daIndex.readInt(); - int parent2Revision = daIndex.readInt(); - // Hg has 32 bytes here, uses 20 for nodeid, and keeps 12 last bytes empty - daIndex.readBytes(nodeidBuf, 0, 20); // +32 - daIndex.skip(12); - DataAccess userDataAccess = null; - if (needData) { - int streamOffset; - DataAccess streamDataAccess; - if (inline) { - streamDataAccess = daIndex; - streamOffset = getIndexOffsetInt(i) + REVLOGV1_RECORD_SIZE; // don't need to do seek as it's actual position in the index stream + r.start(end - start + 1); + r.range(start, end); + } catch (IOException ex) { + throw new HgBadStateException(ex); // FIXME need better handling + } finally { + r.finish(); + } + } + + /** + * Effective alternative to {@link #iterate(int, int, boolean, Inspector) batch read}, when only few selected + * revisions are of interest. + * @param sortedRevisions revisions to walk, in ascending order. + * @param needData whether inspector needs access to header only + * @param inspector callback to process entries + */ + public void iterate(int[] sortedRevisions, boolean needData, Inspector inspector) { + final int indexSize = revisionCount(); + if (indexSize == 0 || sortedRevisions.length == 0) { + return; + } + if (sortedRevisions[0] > indexSize || sortedRevisions[sortedRevisions.length - 1] > indexSize) { + throw new IllegalArgumentException(String.format("Can't iterate [%d, %d] in range [0..%d]", sortedRevisions[0], sortedRevisions[sortedRevisions.length - 1], indexSize)); + } + + ReaderN1 r = new ReaderN1(needData, inspector); + try { + r.start(sortedRevisions.length); + for (int i = 0; i < sortedRevisions.length; ) { + int x = i; + i++; + while (i < sortedRevisions.length) { + if (sortedRevisions[i] == sortedRevisions[i-1] + 1) { + i++; } else { - streamOffset = (int) offset; - streamDataAccess = daData; - daData.seek(streamOffset); - } - final boolean patchToPrevious = baseRevision != i; // the only way I found to tell if it's a patch - if (streamDataAccess.isEmpty()) { - userDataAccess = new DataAccess(); // empty - } else { - final byte firstByte = streamDataAccess.readByte(); - if (firstByte == 0x78 /* 'x' */) { - userDataAccess = new InflaterDataAccess(streamDataAccess, streamOffset, compressedLen, patchToPrevious ? -1 : actualLen); - } else if (firstByte == 0x75 /* 'u' */) { - userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset+1, compressedLen-1); - } else { - // XXX Python impl in fact throws exception when there's not 'x', 'u' or '0' - // but I don't see reason not to return data as is - userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset, compressedLen); - } - } - // XXX - if (patchToPrevious) { - // this is a patch - LinkedList<PatchRecord> patches = new LinkedList<PatchRecord>(); - while (!userDataAccess.isEmpty()) { - PatchRecord pr = PatchRecord.read(userDataAccess); -// System.out.printf("PatchRecord:%d %d %d\n", pr.start, pr.end, pr.len); - patches.add(pr); - } - userDataAccess.done(); - // - byte[] userData = apply(lastUserData, actualLen, patches); - userDataAccess = new ByteArrayDataAccess(userData); - } - } else { - if (inline) { - daIndex.skip(compressedLen); - } - } - if (!extraReadsToBaseRev || i >= start) { - inspector.next(i, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeidBuf, userDataAccess); - } - if (cb != null) { - if (cb.isStopped()) { break; } } - if (userDataAccess != null) { - userDataAccess.reset(); - if (lastUserData != null) { - lastUserData.done(); - } - lastUserData = userDataAccess; + // commitRevisions[x..i-1] are sequential + if (!r.range(sortedRevisions[x], sortedRevisions[i-1])) { + return; } } } catch (IOException ex) { throw new HgBadStateException(ex); // FIXME need better handling } finally { - if (inspector instanceof Lifecycle) { - ((Lifecycle) inspector).finish(cb); - } - daIndex.done(); - if (daData != null) { - daData.done(); - } + r.finish(); } } @@ -393,6 +327,161 @@ } } + /** + * operation with single file open/close and multiple diverse reads. + * XXX initOutline might need similar extraction to keen N1 format knowledge + */ + class ReaderN1 { + private final Inspector inspector; + private final boolean needData; + private DataAccess daIndex = null, daData = null; + private Lifecycle.BasicCallback cb = null; + private int lastRevisionRead = BAD_REVISION; + private DataAccess lastUserData; + + public ReaderN1(boolean needData, Inspector insp) { + assert insp != null; + this.needData = needData; + inspector = insp; + } + + public void start(int totalWork) { + daIndex = getIndexStream(); + if (needData && !inline) { + daData = getDataStream(); + } + if (inspector instanceof Lifecycle) { + cb = new Lifecycle.BasicCallback(); + ((Lifecycle) inspector).start(totalWork, cb, cb); + } + } + + public void finish() { + if (lastUserData != null) { + lastUserData.done(); + lastUserData = null; + } + if (inspector instanceof Lifecycle) { + ((Lifecycle) inspector).finish(cb); + } + daIndex.done(); + if (daData != null) { + daData.done(); + } + } + + public boolean range(int start, int end) throws IOException { +// System.out.printf("RevlogStream.ReaderN1.range(): [%d, %d]\n", start, end); + byte[] nodeidBuf = new byte[20]; + int i; + boolean extraReadsToBaseRev = false; // to indicate we read revision prior to start. XXX not sure can't do without + // it (i.e. replace with i >= start) + if (needData && (i = getBaseRevision(start)) < start) { + // if lastRevisionRead in [baseRevision(start), start) can reuse lastUserData + // doesn't make sense to reuse if lastRevisionRead == start (too much to change in the cycle below). + if (lastRevisionRead != BAD_REVISION && i <= lastRevisionRead && lastRevisionRead < start) { + i = lastRevisionRead + 1; // start with first not-yet-read revision + extraReadsToBaseRev = i < start; + } else { + if (lastUserData != null) { + lastUserData.done(); + lastUserData = null; + } + extraReadsToBaseRev = true; + } + } else { + // don't need to clean lastUserData as it's always null when !needData + i = start; + } + + daIndex.seek(getIndexOffsetInt(i)); + + for (; i <= end; i++ ) { + if (inline && needData) { + // inspector reading data (though FilterDataAccess) may have affected index position + daIndex.seek(getIndexOffsetInt(i)); + } + long l = daIndex.readLong(); // 0 + long offset = i == 0 ? 0 : (l >>> 16); + @SuppressWarnings("unused") + int flags = (int) (l & 0X0FFFF); + int compressedLen = daIndex.readInt(); // +8 + int actualLen = daIndex.readInt(); // +12 + int baseRevision = daIndex.readInt(); // +16 + int linkRevision = daIndex.readInt(); // +20 + int parent1Revision = daIndex.readInt(); + int parent2Revision = daIndex.readInt(); + // Hg has 32 bytes here, uses 20 for nodeid, and keeps 12 last bytes empty + daIndex.readBytes(nodeidBuf, 0, 20); // +32 + daIndex.skip(12); + DataAccess userDataAccess = null; + if (needData) { + int streamOffset; + DataAccess streamDataAccess; + if (inline) { + streamDataAccess = daIndex; + streamOffset = getIndexOffsetInt(i) + REVLOGV1_RECORD_SIZE; // don't need to do seek as it's actual position in the index stream + } else { + streamOffset = (int) offset; + streamDataAccess = daData; + daData.seek(streamOffset); + } + final boolean patchToPrevious = baseRevision != i; // the only way I found to tell if it's a patch + if (streamDataAccess.isEmpty()) { + userDataAccess = new DataAccess(); // empty + } else { + final byte firstByte = streamDataAccess.readByte(); + if (firstByte == 0x78 /* 'x' */) { + userDataAccess = new InflaterDataAccess(streamDataAccess, streamOffset, compressedLen, patchToPrevious ? -1 : actualLen); + } else if (firstByte == 0x75 /* 'u' */) { + userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset+1, compressedLen-1); + } else { + // XXX Python impl in fact throws exception when there's not 'x', 'u' or '0' + // but I don't see reason not to return data as is + userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset, compressedLen); + } + } + // XXX + if (patchToPrevious) { + // this is a patch + LinkedList<PatchRecord> patches = new LinkedList<PatchRecord>(); + while (!userDataAccess.isEmpty()) { + PatchRecord pr = PatchRecord.read(userDataAccess); +// System.out.printf("PatchRecord:%d %d %d\n", pr.start, pr.end, pr.len); + patches.add(pr); + } + userDataAccess.done(); + // + byte[] userData = apply(lastUserData, actualLen, patches); + userDataAccess = new ByteArrayDataAccess(userData); + } + } else { + if (inline) { + daIndex.skip(compressedLen); + } + } + if (!extraReadsToBaseRev || i >= start) { + inspector.next(i, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeidBuf, userDataAccess); + } + if (cb != null) { + if (cb.isStopped()) { + return false; + } + } + if (userDataAccess != null) { + userDataAccess.reset(); + if (lastUserData != null) { + lastUserData.done(); + } + lastUserData = userDataAccess; + } + } + lastRevisionRead = end; + return true; + } + } + + private static int[] toArray(List<Integer> l) { int[] rv = new int[l.size()]; for (int i = 0; i < rv.length; i++) {
--- a/src/org/tmatesoft/hg/repo/HgChangelog.java Thu Jun 23 15:16:34 2011 +0200 +++ b/src/org/tmatesoft/hg/repo/HgChangelog.java Thu Jun 23 15:19:07 2011 +0200 @@ -65,21 +65,28 @@ return c.result; } + /** + * Access individual revisions. Note, regardless of supplied revision order, inspector gets + * changesets strictly in the order they are in the changelog. + * @param inspector callback to get changesets + * @param revisions revisions to read, unrestricted ordering. + */ public void range(final HgChangelog.Inspector inspector, final int... revisions) { - if (revisions == null || revisions.length == 0) { + Arrays.sort(revisions); + rangeInternal(inspector, revisions); + } + + /** + * Friends-only version of {@link #range(Inspector, int...)}, when callers know array is sorted + */ + /*package-local*/ void rangeInternal(HgChangelog.Inspector inspector, int[] sortedRevisions) { + if (sortedRevisions == null || sortedRevisions.length == 0) { return; } - RevlogStream.Inspector i = new RevlogStream.Inspector() { - private final RawCsetParser delegate = new RawCsetParser(inspector); - - public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess da) { - if (Arrays.binarySearch(revisions, revisionNumber) >= 0) { - delegate.next(revisionNumber, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeid, da); - } - } - }; - Arrays.sort(revisions); - content.iterate(revisions[0], revisions[revisions.length - 1], true, i); + if (inspector == null) { + throw new IllegalArgumentException(); + } + content.iterate(sortedRevisions, true, new RawCsetParser(inspector)); } public RawChangeset changeset(Nodeid nid) {
--- a/src/org/tmatesoft/hg/repo/HgDataFile.java Thu Jun 23 15:16:34 2011 +0200 +++ b/src/org/tmatesoft/hg/repo/HgDataFile.java Thu Jun 23 15:19:07 2011 +0200 @@ -224,53 +224,26 @@ throw new IllegalArgumentException(); } final int[] commitRevisions = new int[end - start + 1]; + final boolean[] needsSorting = { false }; RevlogStream.Inspector insp = new RevlogStream.Inspector() { int count = 0; - public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess data) { + if (count > 0) { + if (commitRevisions[count -1] > linkRevision) { + needsSorting[0] = true; + } + } commitRevisions[count++] = linkRevision; } }; content.iterate(start, end, false, insp); final HgChangelog changelog = getRepo().getChangelog(); - /* - * changelog.range(inspector, commitRevisions); - * Not effective when changes are sparse and far from each other. - * However, it's only single file read, unlike code below (few reads of sequential blocks) - * Need to weight tradeoffs of file read and iteration of sparse files. - */ - - // - final int HistoricallyCloseCommits = 50; // XXX perhaps, shall increase/decrease based on changelog.revisionCount() - // (huge changelog => memory mapped files, each file re-read is more expensive than iterating over records in one read? - // - // try short sequences on neighboring revisions. - Arrays.sort(commitRevisions); // automatic tools (svnmerge?) produce unnatural file history - // (e.g. cpython/Lib/doctest.py, revision 164 points to 63509, 165 - to 38453) - for (int i = 0; i < commitRevisions.length; ) { - int x = i; - i++; - boolean sequential = true; - while (i < commitRevisions.length) { - if (commitRevisions[i] == commitRevisions[i-1] + 1) { - i++; - } else if (commitRevisions[i] - commitRevisions[i-1] < HistoricallyCloseCommits) { - // close enough, but not sequential - sequential = false; - i++; - } else { - break; - } - } - if (sequential) { - // commitRevisions[x..i-1] are sequential - changelog.range(commitRevisions[x], commitRevisions[i-1], inspector); - } else { - int[] revs = new int[i-x]; - System.arraycopy(commitRevisions, x, revs, 0, i-x); - changelog.range(inspector, revs); - } + if (needsSorting[0]) { + // automatic tools (svnmerge?) produce unnatural file history + // (e.g. cpython/Lib/doctest.py, revision 164 points to cset 63509, 165 - to 38453) + Arrays.sort(commitRevisions); } + changelog.range(inspector, commitRevisions); } // for a given local revision of the file, find out local revision in the changelog