changeset 242:ad6a046943be

Improved reading of sparse revisions from a revlog
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Thu, 23 Jun 2011 15:19:07 +0200 (2011-06-23)
parents d3ab16739736
children 0e01f9182e16
files cmdline/org/tmatesoft/hg/console/Main.java src/org/tmatesoft/hg/internal/RevlogStream.java src/org/tmatesoft/hg/repo/HgChangelog.java src/org/tmatesoft/hg/repo/HgDataFile.java
diffstat 4 files changed, 240 insertions(+), 158 deletions(-) [+]
line wrap: on
line diff
--- a/cmdline/org/tmatesoft/hg/console/Main.java	Thu Jun 23 15:16:34 2011 +0200
+++ b/cmdline/org/tmatesoft/hg/console/Main.java	Thu Jun 23 15:19:07 2011 +0200
@@ -72,10 +72,10 @@
 
 	public static void main(String[] args) throws Exception {
 		Main m = new Main(args);
-		m.testSubrepos();
+//		m.testSubrepos();
 //		m.testReadWorkingCopy();
 //		m.testParents();
-//		m.testEffectiveFileLog();
+		m.testEffectiveFileLog();
 //		m.testCatAtCsetRevision();
 //		m.testMergeState();
 //		m.testFileStatus();
@@ -132,10 +132,22 @@
 		}
 	}
 	
-	// -R \temp\hg\hg4j-50 src/org/tmatesoft/hg/internal/RevlogStream.java
+	/*
+	 *  -R \temp\hg\hg4j-50 src/org/tmatesoft/hg/internal/RevlogStream.java
+	 *  
+	 *  -R \temp\hg\cpython Lib/doctest.py, range 15907..68588, total 251 revision
+	 *  no improvement (collect linkRev, hgchangelog.range([]))							10890 ms
+	 *  improved history logic in HgDataFile (minimize reads of close revisions): 
+	 *  		with no sort (defect for tool-created repos)					took	10500 ms
+	 *  		with sort (to order revisions from linkRev before use)					  610 ms
+	 *  			HgChangelog.range() - 92 calls
+	 *  RevlogStream with separate iterate(int[] sortedRevisions,...)
+	 *  		RevlogStream.ReaderN1.range(): 185										  380 ms 
+	 */
 	private void testEffectiveFileLog() {
 		for (String fname : cmdLineOpts.getList("")) {
 			System.out.println(fname);
+			final long start = System.currentTimeMillis();
 			HgDataFile fn = hgRepo.getFileNode(fname);
 			if (fn.exists()) {
 				fn.history(new HgChangelog.Inspector() {
@@ -144,6 +156,7 @@
 					}
 				});
 			}
+			System.out.printf("Done: %d\n", System.currentTimeMillis() - start);
 		}
 	}
 	
--- a/src/org/tmatesoft/hg/internal/RevlogStream.java	Thu Jun 23 15:16:34 2011 +0200
+++ b/src/org/tmatesoft/hg/internal/RevlogStream.java	Thu Jun 23 15:19:07 2011 +0200
@@ -200,121 +200,55 @@
 		}
 		// XXX may cache [start .. end] from index with a single read (pre-read)
 		
-		Lifecycle.BasicCallback cb = null;
-		DataAccess daIndex = null, daData = null;
-		daIndex = getIndexStream();
-		if (needData && !inline) {
-			daData = getDataStream();
-		}
+		ReaderN1 r = new ReaderN1(needData, inspector);
 		try {
-			byte[] nodeidBuf = new byte[20];
-			DataAccess lastUserData = null;
-			int i;
-			boolean extraReadsToBaseRev = false;
-			if (needData && getBaseRevision(start) < start) {
-				i = getBaseRevision(start);
-				extraReadsToBaseRev = true;
-			} else {
-				i = start;
-			}
-			
-			daIndex.seek(getIndexOffsetInt(i));
-			
-			if (inspector instanceof Lifecycle) {
-				cb = new Lifecycle.BasicCallback();
-				((Lifecycle) inspector).start(end - start + 1, cb, cb);
-			}
-			
-			for (; i <= end; i++ ) {
-				if (inline && needData) {
-					// inspector reading data (though FilterDataAccess) may have affected index position
-					daIndex.seek(getIndexOffsetInt(i));
-				}
-				long l = daIndex.readLong(); // 0
-				long offset = i == 0 ? 0 : (l >>> 16);
-				@SuppressWarnings("unused")
-				int flags = (int) (l & 0X0FFFF);
-				int compressedLen = daIndex.readInt(); // +8
-				int actualLen = daIndex.readInt(); // +12
-				int baseRevision = daIndex.readInt(); // +16
-				int linkRevision = daIndex.readInt(); // +20
-				int parent1Revision = daIndex.readInt();
-				int parent2Revision = daIndex.readInt();
-				// Hg has 32 bytes here, uses 20 for nodeid, and keeps 12 last bytes empty
-				daIndex.readBytes(nodeidBuf, 0, 20); // +32
-				daIndex.skip(12);
-				DataAccess userDataAccess = null;
-				if (needData) {
-					int streamOffset;
-					DataAccess streamDataAccess;
-					if (inline) {
-						streamDataAccess = daIndex;
-						streamOffset = getIndexOffsetInt(i) + REVLOGV1_RECORD_SIZE; // don't need to do seek as it's actual position in the index stream
+			r.start(end - start + 1);
+			r.range(start, end);
+		} catch (IOException ex) {
+			throw new HgBadStateException(ex); // FIXME need better handling
+		} finally {
+			r.finish();
+		}
+	}
+	
+	/**
+	 * Effective alternative to {@link #iterate(int, int, boolean, Inspector) batch read}, when only few selected 
+	 * revisions are of interest.
+	 * @param sortedRevisions revisions to walk, in ascending order.
+	 * @param needData whether inspector needs access to header only
+	 * @param inspector callback to process entries
+	 */
+	public void iterate(int[] sortedRevisions, boolean needData, Inspector inspector) {
+		final int indexSize = revisionCount();
+		if (indexSize == 0 || sortedRevisions.length == 0) {
+			return;
+		}
+		if (sortedRevisions[0] > indexSize || sortedRevisions[sortedRevisions.length - 1] > indexSize) {
+			throw new IllegalArgumentException(String.format("Can't iterate [%d, %d] in range [0..%d]", sortedRevisions[0], sortedRevisions[sortedRevisions.length - 1], indexSize));
+		}
+
+		ReaderN1 r = new ReaderN1(needData, inspector);
+		try {
+			r.start(sortedRevisions.length);
+			for (int i = 0; i < sortedRevisions.length; ) {
+				int x = i;
+				i++;
+				while (i < sortedRevisions.length) {
+					if (sortedRevisions[i] == sortedRevisions[i-1] + 1) {
+						i++;
 					} else {
-						streamOffset = (int) offset;
-						streamDataAccess = daData;
-						daData.seek(streamOffset);
-					}
-					final boolean patchToPrevious = baseRevision != i; // the only way I found to tell if it's a patch
-					if (streamDataAccess.isEmpty()) {
-						userDataAccess = new DataAccess(); // empty
-					} else {
-						final byte firstByte = streamDataAccess.readByte();
-						if (firstByte == 0x78 /* 'x' */) {
-							userDataAccess = new InflaterDataAccess(streamDataAccess, streamOffset, compressedLen, patchToPrevious ? -1 : actualLen);
-						} else if (firstByte == 0x75 /* 'u' */) {
-							userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset+1, compressedLen-1);
-						} else {
-							// XXX Python impl in fact throws exception when there's not 'x', 'u' or '0'
-							// but I don't see reason not to return data as is 
-							userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset, compressedLen);
-						}
-					}
-					// XXX 
-					if (patchToPrevious) {
-						// this is a patch
-						LinkedList<PatchRecord> patches = new LinkedList<PatchRecord>();
-						while (!userDataAccess.isEmpty()) {
-							PatchRecord pr = PatchRecord.read(userDataAccess);
-//							System.out.printf("PatchRecord:%d %d %d\n", pr.start, pr.end, pr.len);
-							patches.add(pr);
-						}
-						userDataAccess.done();
-						//
-						byte[] userData = apply(lastUserData, actualLen, patches);
-						userDataAccess = new ByteArrayDataAccess(userData);
-					}
-				} else {
-					if (inline) {
-						daIndex.skip(compressedLen);
-					}
-				}
-				if (!extraReadsToBaseRev || i >= start) {
-					inspector.next(i, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeidBuf, userDataAccess);
-				}
-				if (cb != null) {
-					if (cb.isStopped()) {
 						break;
 					}
 				}
-				if (userDataAccess != null) {
-					userDataAccess.reset();
-					if (lastUserData != null) {
-						lastUserData.done();
-					}
-					lastUserData = userDataAccess;
+				// commitRevisions[x..i-1] are sequential
+				if (!r.range(sortedRevisions[x], sortedRevisions[i-1])) {
+					return;
 				}
 			}
 		} catch (IOException ex) {
 			throw new HgBadStateException(ex); // FIXME need better handling
 		} finally {
-			if (inspector instanceof Lifecycle) {
-				((Lifecycle) inspector).finish(cb);
-			}
-			daIndex.done();
-			if (daData != null) {
-				daData.done();
-			}
+			r.finish();
 		}
 	}
 
@@ -393,6 +327,161 @@
 		}
 	}
 	
+	/**
+	 * operation with single file open/close and multiple diverse reads.
+	 * XXX initOutline might need similar extraction to keen N1 format knowledge  
+	 */
+	class ReaderN1 {
+		private final Inspector inspector;
+		private final boolean needData;
+		private DataAccess daIndex = null, daData = null;
+		private Lifecycle.BasicCallback cb = null;
+		private int lastRevisionRead = BAD_REVISION;
+		private DataAccess lastUserData;
+
+		public ReaderN1(boolean needData, Inspector insp) {
+			assert insp != null;
+			this.needData = needData;
+			inspector = insp;
+		}
+		
+		public void start(int totalWork) {
+			daIndex = getIndexStream();
+			if (needData && !inline) {
+				daData = getDataStream();
+			}
+			if (inspector instanceof Lifecycle) {
+				cb = new Lifecycle.BasicCallback();
+				((Lifecycle) inspector).start(totalWork, cb, cb);
+			}
+		}
+
+		public void finish() {
+			if (lastUserData != null) {
+				lastUserData.done();
+				lastUserData = null;
+			}
+			if (inspector instanceof Lifecycle) {
+				((Lifecycle) inspector).finish(cb);
+			}
+			daIndex.done();
+			if (daData != null) {
+				daData.done();
+			}
+		}
+		
+		public boolean range(int start, int end) throws IOException {
+//			System.out.printf("RevlogStream.ReaderN1.range(): [%d, %d]\n", start, end);
+			byte[] nodeidBuf = new byte[20];
+			int i;
+			boolean extraReadsToBaseRev = false; // to indicate we read revision prior to start. XXX not sure can't do without
+			// it (i.e. replace with i >= start)
+			if (needData && (i = getBaseRevision(start)) < start) {
+				// if lastRevisionRead in [baseRevision(start), start)  can reuse lastUserData
+				// doesn't make sense to reuse if lastRevisionRead == start (too much to change in the cycle below). 
+				if (lastRevisionRead != BAD_REVISION && i <= lastRevisionRead && lastRevisionRead < start) {
+					i = lastRevisionRead + 1; // start with first not-yet-read revision
+					extraReadsToBaseRev = i < start;
+				} else {
+					if (lastUserData != null) {
+						lastUserData.done();
+						lastUserData = null;
+					}
+					extraReadsToBaseRev = true;
+				}
+			} else {
+				// don't need to clean lastUserData as it's always null when !needData
+				i = start;
+			}
+			
+			daIndex.seek(getIndexOffsetInt(i));
+			
+			for (; i <= end; i++ ) {
+				if (inline && needData) {
+					// inspector reading data (though FilterDataAccess) may have affected index position
+					daIndex.seek(getIndexOffsetInt(i));
+				}
+				long l = daIndex.readLong(); // 0
+				long offset = i == 0 ? 0 : (l >>> 16);
+				@SuppressWarnings("unused")
+				int flags = (int) (l & 0X0FFFF);
+				int compressedLen = daIndex.readInt(); // +8
+				int actualLen = daIndex.readInt(); // +12
+				int baseRevision = daIndex.readInt(); // +16
+				int linkRevision = daIndex.readInt(); // +20
+				int parent1Revision = daIndex.readInt();
+				int parent2Revision = daIndex.readInt();
+				// Hg has 32 bytes here, uses 20 for nodeid, and keeps 12 last bytes empty
+				daIndex.readBytes(nodeidBuf, 0, 20); // +32
+				daIndex.skip(12);
+				DataAccess userDataAccess = null;
+				if (needData) {
+					int streamOffset;
+					DataAccess streamDataAccess;
+					if (inline) {
+						streamDataAccess = daIndex;
+						streamOffset = getIndexOffsetInt(i) + REVLOGV1_RECORD_SIZE; // don't need to do seek as it's actual position in the index stream
+					} else {
+						streamOffset = (int) offset;
+						streamDataAccess = daData;
+						daData.seek(streamOffset);
+					}
+					final boolean patchToPrevious = baseRevision != i; // the only way I found to tell if it's a patch
+					if (streamDataAccess.isEmpty()) {
+						userDataAccess = new DataAccess(); // empty
+					} else {
+						final byte firstByte = streamDataAccess.readByte();
+						if (firstByte == 0x78 /* 'x' */) {
+							userDataAccess = new InflaterDataAccess(streamDataAccess, streamOffset, compressedLen, patchToPrevious ? -1 : actualLen);
+						} else if (firstByte == 0x75 /* 'u' */) {
+							userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset+1, compressedLen-1);
+						} else {
+							// XXX Python impl in fact throws exception when there's not 'x', 'u' or '0'
+							// but I don't see reason not to return data as is 
+							userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset, compressedLen);
+						}
+					}
+					// XXX 
+					if (patchToPrevious) {
+						// this is a patch
+						LinkedList<PatchRecord> patches = new LinkedList<PatchRecord>();
+						while (!userDataAccess.isEmpty()) {
+							PatchRecord pr = PatchRecord.read(userDataAccess);
+//							System.out.printf("PatchRecord:%d %d %d\n", pr.start, pr.end, pr.len);
+							patches.add(pr);
+						}
+						userDataAccess.done();
+						//
+						byte[] userData = apply(lastUserData, actualLen, patches);
+						userDataAccess = new ByteArrayDataAccess(userData);
+					}
+				} else {
+					if (inline) {
+						daIndex.skip(compressedLen);
+					}
+				}
+				if (!extraReadsToBaseRev || i >= start) {
+					inspector.next(i, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeidBuf, userDataAccess);
+				}
+				if (cb != null) {
+					if (cb.isStopped()) {
+						return false;
+					}
+				}
+				if (userDataAccess != null) {
+					userDataAccess.reset();
+					if (lastUserData != null) {
+						lastUserData.done();
+					}
+					lastUserData = userDataAccess;
+				}
+			}
+			lastRevisionRead = end;
+			return true;
+		}
+	}
+
+	
 	private static int[] toArray(List<Integer> l) {
 		int[] rv = new int[l.size()];
 		for (int i = 0; i < rv.length; i++) {
--- a/src/org/tmatesoft/hg/repo/HgChangelog.java	Thu Jun 23 15:16:34 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgChangelog.java	Thu Jun 23 15:19:07 2011 +0200
@@ -65,21 +65,28 @@
 		return c.result;
 	}
 
+	/**
+	 * Access individual revisions. Note, regardless of supplied revision order, inspector gets
+	 * changesets strictly in the order they are in the changelog.
+	 * @param inspector callback to get changesets
+	 * @param revisions revisions to read, unrestricted ordering.
+	 */
 	public void range(final HgChangelog.Inspector inspector, final int... revisions) {
-		if (revisions == null || revisions.length == 0) {
+		Arrays.sort(revisions);
+		rangeInternal(inspector, revisions);
+	}
+
+	/**
+	 * Friends-only version of {@link #range(Inspector, int...)}, when callers know array is sorted
+	 */
+	/*package-local*/ void rangeInternal(HgChangelog.Inspector inspector, int[] sortedRevisions) {
+		if (sortedRevisions == null || sortedRevisions.length == 0) {
 			return;
 		}
-		RevlogStream.Inspector i = new RevlogStream.Inspector() {
-			private final RawCsetParser delegate = new RawCsetParser(inspector);
-
-			public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess da) {
-				if (Arrays.binarySearch(revisions, revisionNumber) >= 0) {
-					delegate.next(revisionNumber, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeid, da);
-				}
-			}
-		};
-		Arrays.sort(revisions);
-		content.iterate(revisions[0], revisions[revisions.length - 1], true, i);
+		if (inspector == null) {
+			throw new IllegalArgumentException();
+		}
+		content.iterate(sortedRevisions, true, new RawCsetParser(inspector));
 	}
 	
 	public RawChangeset changeset(Nodeid nid) {
--- a/src/org/tmatesoft/hg/repo/HgDataFile.java	Thu Jun 23 15:16:34 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgDataFile.java	Thu Jun 23 15:19:07 2011 +0200
@@ -224,53 +224,26 @@
 			throw new IllegalArgumentException();
 		}
 		final int[] commitRevisions = new int[end - start + 1];
+		final boolean[] needsSorting = { false };
 		RevlogStream.Inspector insp = new RevlogStream.Inspector() {
 			int count = 0;
-			
 			public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess data) {
+				if (count > 0) {
+					if (commitRevisions[count -1] > linkRevision) {
+						needsSorting[0] = true;
+					}
+				}
 				commitRevisions[count++] = linkRevision;
 			}
 		};
 		content.iterate(start, end, false, insp);
 		final HgChangelog changelog = getRepo().getChangelog();
-		/*
-		 * changelog.range(inspector, commitRevisions);
-		 * Not effective when changes are sparse and far from each other.
-		 * However, it's only single file read, unlike code below (few reads of sequential blocks)
-		 * Need to weight tradeoffs of file read and iteration of sparse files.
-		 */
-		 
-		//
-		final int HistoricallyCloseCommits = 50; // XXX perhaps, shall increase/decrease based on changelog.revisionCount() 
-		// (huge changelog => memory mapped files, each file re-read is more expensive than iterating over records in one read?
-		//
-		// try short sequences on neighboring revisions.
-		Arrays.sort(commitRevisions); // automatic tools (svnmerge?) produce unnatural file history 
-		// (e.g. cpython/Lib/doctest.py, revision 164 points to 63509, 165 - to 38453) 
-		for (int i = 0; i < commitRevisions.length; ) {
-			int x = i;
-			i++;
-			boolean sequential = true;
-			while (i < commitRevisions.length) {
-				if (commitRevisions[i] == commitRevisions[i-1] + 1) {
-					i++;
-				} else if (commitRevisions[i] - commitRevisions[i-1] < HistoricallyCloseCommits) {
-					// close enough, but not sequential
-					sequential = false;
-					i++;
-				} else {
-					break;
-				}
-			}
-			if (sequential) {
-				// commitRevisions[x..i-1] are sequential
-				changelog.range(commitRevisions[x], commitRevisions[i-1], inspector);
-			} else {
-				int[] revs = new int[i-x];
-				System.arraycopy(commitRevisions, x, revs, 0, i-x);
-				changelog.range(inspector, revs);
-			}
+		if (needsSorting[0]) {
+			// automatic tools (svnmerge?) produce unnatural file history
+			// (e.g. cpython/Lib/doctest.py, revision 164 points to cset 63509, 165 - to 38453) 
+			Arrays.sort(commitRevisions);
 		}
+		changelog.range(inspector, commitRevisions);
 	}
 	
 	// for a given local revision of the file, find out local revision in the changelog