changeset 197:3a7696fb457c

Investigate optimization options to allow fast processing of huge repositories. Fix defect in StatusCollector that lead to wrong result comparing first revision to empty repo (-1 to 0), due to same TIP constant value
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Tue, 19 Apr 2011 03:49:29 +0200
parents e2115da4cf6a
children 33a7d76f067b
files cmdline/org/tmatesoft/hg/console/Log.java cmdline/org/tmatesoft/hg/console/Main.java design.txt src/org/tmatesoft/hg/core/Nodeid.java src/org/tmatesoft/hg/repo/HgRepository.java src/org/tmatesoft/hg/repo/HgStatusCollector.java src/org/tmatesoft/hg/repo/Revlog.java
diffstat 7 files changed, 131 insertions(+), 45 deletions(-) [+]
line wrap: on
line diff
--- a/cmdline/org/tmatesoft/hg/console/Log.java	Mon Apr 18 18:04:24 2011 +0200
+++ b/cmdline/org/tmatesoft/hg/console/Log.java	Tue Apr 19 03:49:29 2011 +0200
@@ -35,6 +35,8 @@
  */
 public class Log {
 
+	// -agentlib:hprof=heap=sites,depth=10,etc might be handy to debug speed/memory issues
+	
 	public static void main(String[] args) throws Exception {
 		Options cmdLineOpts = Options.parse(args);
 		HgRepository hgRepo = cmdLineOpts.findRepository();
@@ -45,7 +47,7 @@
 		final Dump dump = new Dump(hgRepo);
 		dump.complete = cmdLineOpts.getBoolean("--debug");
 		dump.verbose = cmdLineOpts.getBoolean("-v", "--verbose");
-		dump.reverseOrder = true;
+		dump.reverseOrder = false;
 		HgLogCommand cmd = new HgLogCommand(hgRepo);
 		for (String u : cmdLineOpts.getList("-u", "--user")) {
 			cmd.user(u);
@@ -58,6 +60,7 @@
 			cmd.limit(limit);
 		}
 		List<String> files = cmdLineOpts.getList("");
+		final long start = System.currentTimeMillis();
 		if (files.isEmpty()) {
 			if (limit == -1) {
 				// no revisions and no limit
@@ -90,8 +93,9 @@
 				dump.complete();
 			}
 		}
-		//
-		// XXX new ChangelogWalker().setFile("hello.c").setRevisionRange(1, 4).accept(new Visitor);
+//		cmd = null;
+		System.out.println("Total time:" + (System.currentTimeMillis() - start));
+//		Main.force_gc();
 	}
 	
 	private static int fixRange(int[] start_end, boolean reverse, int limit) {
--- a/cmdline/org/tmatesoft/hg/console/Main.java	Mon Apr 18 18:04:24 2011 +0200
+++ b/cmdline/org/tmatesoft/hg/console/Main.java	Tue Apr 19 03:49:29 2011 +0200
@@ -225,6 +225,15 @@
 		}
 	}
 
+	static void force_gc() {
+		Runtime.getRuntime().runFinalization();
+		Runtime.getRuntime().gc();
+		Thread.yield();
+		Runtime.getRuntime().runFinalization();
+		Runtime.getRuntime().gc();
+		Thread.yield();
+	}
+
 	private static class StatusDump implements HgStatusInspector {
 		public boolean hideStatusPrefix = false; // hg status -n option
 		public boolean showCopied = true; // -C
--- a/design.txt	Mon Apr 18 18:04:24 2011 +0200
+++ b/design.txt	Tue Apr 19 03:49:29 2011 +0200
@@ -60,6 +60,7 @@
 
 ? subrepos in log, status (-S) and manifest commands
 
+? when p1 == -1, and p2 != -1, does HgStatusCollector.change() give correct result?
 
 Commands to get CommandContext where they may share various caches (e.g. StatusCollector)
 Perhaps, abstract classes for all Inspectors (i.e. StatusCollector.Inspector) for users to use as base classes to protect from change?
@@ -92,6 +93,13 @@
   + ConfigFile to strip comments from values (#)
 
 <<<<<
+Performance.
+after pooling/caching in HgStatusCollector and HgChangeset
+hg log --debug -r 0:5000 and same via Log/HgLogCommand: approx. 220 seconds vs 279 seconds. Mem. cons. 20 vs 80 mb.
+after further changes in HgStatusCollector (to read ahead 5 elements, 50 max cache, fixed bug with -1) - hg4j dumps 5000 in
+93 seconds, memory consumption about 50-56 Mb
+
+<<<<<
 
 Tests:
 DataAccess - readBytes(length > memBufferSize, length*2 > memBufferSize) - to check impl is capable to read huge chunks of data, regardless of own buffer size
--- a/src/org/tmatesoft/hg/core/Nodeid.java	Mon Apr 18 18:04:24 2011 +0200
+++ b/src/org/tmatesoft/hg/core/Nodeid.java	Tue Apr 19 03:49:29 2011 +0200
@@ -46,11 +46,23 @@
 	 */
 	public Nodeid(byte[] binaryRepresentation, boolean shallClone) {
 		// 5 int fields => 32 bytes
-		// byte[20] => 48 bytes
+		// byte[20] => 48 bytes (16 bytes is Nodeid with one field, 32 bytes for byte[20] 
 		if (binaryRepresentation == null || binaryRepresentation.length != 20) {
 			throw new IllegalArgumentException();
 		}
-		this.binaryData = shallClone ? binaryRepresentation.clone() : binaryRepresentation;
+		/*
+		 * byte[].clone() is not reflected when ran with -agentlib:hprof=heap=sites
+		 * thus not to get puzzled why there are N Nodeids and much less byte[] instances,
+		 * may use following code to see N byte[] as well.
+		 *
+		if (shallClone) {
+			binaryData = new byte[20];
+			System.arraycopy(binaryRepresentation, 0, binaryData, 0, 20);
+		} else {
+			binaryData = binaryRepresentation;
+		}
+		*/
+		binaryData = shallClone ? binaryRepresentation.clone() : binaryRepresentation;
 	}
 
 	@Override
--- a/src/org/tmatesoft/hg/repo/HgRepository.java	Mon Apr 18 18:04:24 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgRepository.java	Tue Apr 19 03:49:29 2011 +0200
@@ -45,8 +45,8 @@
  */
 public final class HgRepository {
 
-	// if new constants added, consider fixing HgInternals#badLocalRevision
-	public static final int TIP = -1;
+	// if new constants added, consider fixing HgInternals#wrongLocalRevision
+	public static final int TIP = -3;
 	public static final int BAD_REVISION = Integer.MIN_VALUE;
 	public static final int WORKING_COPY = -2;
 
--- a/src/org/tmatesoft/hg/repo/HgStatusCollector.java	Mon Apr 18 18:04:24 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgStatusCollector.java	Tue Apr 19 03:49:29 2011 +0200
@@ -51,10 +51,11 @@
 	// no cache limit, no nodeids and fname caching - OOME on changeset 1035
 	// no cache limit, but with cached nodeids and filenames - 1730+
 	// cache limit 100 - 19+ minutes to process 10000, and still working (too long, stopped)
-	private final int cacheMaxSize = 100; // do not keep too much manifest revisions
+	private final int cacheMaxSize = 50; // do not keep too much manifest revisions
 	private PathPool pathPool;
 	private final Pool<Nodeid> cacheNodes;
 	private final Pool<String> cacheFilenames; // XXX in fact, need to think if use of PathPool directly instead is better solution
+	private final ManifestRevisionInspector emptyFakeState;
 	
 
 	public HgStatusCollector(HgRepository hgRepo) {
@@ -62,10 +63,10 @@
 		cache = new TreeMap<Integer, ManifestRevisionInspector>();
 		cacheNodes = new Pool<Nodeid>();
 		cacheFilenames = new Pool<String>();
-		ManifestRevisionInspector emptyFakeState = new ManifestRevisionInspector(null, null);
+
+		emptyFakeState = new ManifestRevisionInspector(null, null);
 		emptyFakeState.begin(-1, null);
-		emptyFakeState.end(-1); // FIXME HgRepo.TIP == -1 as well, need to distinguish fake "prior to first" revision from "the very last" 
-		cache.put(-1, emptyFakeState);
+		emptyFakeState.end(-1);
 	}
 	
 	public HgRepository getRepo() {
@@ -75,7 +76,10 @@
 	private ManifestRevisionInspector get(int rev) {
 		ManifestRevisionInspector i = cache.get(rev);
 		if (i == null) {
-			if (cache.size() > cacheMaxSize) {
+			if (rev == -1) {
+				return emptyFakeState;
+			}
+			while (cache.size() > cacheMaxSize) {
 				// assume usually we go from oldest to newest, hence remove oldest as most likely to be no longer necessary
 				cache.remove(cache.firstKey());
 			}
@@ -85,6 +89,51 @@
 		}
 		return i;
 	}
+
+	private boolean cached(int revision) {
+		return cache.containsKey(revision) || revision == -1;
+	}
+	
+	private void initCacheRange(int minRev, int maxRev) {
+		while (cache.size() > cacheMaxSize) {
+			// assume usually we go from oldest to newest, hence remove oldest as most likely to be no longer necessary
+			cache.remove(cache.firstKey());
+		}
+		repo.getManifest().walk(minRev, maxRev, new HgManifest.Inspector() {
+			private ManifestRevisionInspector delegate;
+			private boolean cacheHit; // range may include revisions we already know about, do not re-create them
+
+			public boolean begin(int revision, Nodeid nid) {
+				assert delegate == null;
+				if (cache.containsKey(revision)) { // don't need to check emptyFakeState hit as revision never -1 here
+					cacheHit = true;
+				} else {
+					cache.put(revision, delegate = new ManifestRevisionInspector(cacheNodes, cacheFilenames));
+					// cache may grow bigger than max size here, but it's ok as present simplistic cache clearing mechanism may
+					// otherwise remove entries we just added
+					delegate.begin(revision, nid);
+					cacheHit = false;
+				}
+				return true;
+			}
+
+			public boolean next(Nodeid nid, String fname, String flags) {
+				if (!cacheHit) {
+					delegate.next(nid, fname, flags);
+				}
+				return true;
+			}
+			
+			public boolean end(int revision) {
+				if (!cacheHit) {
+					delegate.end(revision);
+				}
+				cacheHit = false;				
+				delegate = null;
+				return true;
+			}
+		});
+	}
 	
 	/*package-local*/ ManifestRevisionInspector raw(int rev) {
 		return get(rev);
@@ -110,10 +159,12 @@
 		repo.getChangelog().parents(rev, parents, null, null);
 		walk(parents[0], rev, inspector);
 	}
-
+	
 	// I assume revision numbers are the same for changelog and manifest - here 
 	// user would like to pass changelog revision numbers, and I use them directly to walk manifest.
 	// if this assumption is wrong, fix this (lookup manifest revisions from changeset).
+	// rev1 and rev2 may be -1 to indicate comparison to empty repository
+	// argument order matters 
 	public void walk(int rev1, int rev2, HgStatusInspector inspector) {
 		if (rev1 == rev2) {
 			throw new IllegalArgumentException();
@@ -124,44 +175,42 @@
 		if (inspector instanceof Record) {
 			((Record) inspector).init(rev1, rev2, this);
 		}
+		final int lastManifestRevision = repo.getManifest().getLastRevision();
 		if (rev1 == TIP) {
-			rev1 = repo.getManifest().getLastRevision();
+			rev1 = lastManifestRevision;
 		}
 		if (rev2 == TIP) {
-			rev2 = repo.getManifest().getLastRevision(); 
+			rev2 = lastManifestRevision; 
 		}
 		// in fact, rev1 and rev2 are often next (or close) to each other,
 		// thus, we can optimize Manifest reads here (manifest.walk(rev1, rev2))
 		ManifestRevisionInspector r1, r2 ;
-		if (!cache.containsKey(rev1) && !cache.containsKey(rev2) && Math.abs(rev1 - rev2) < 5 /*subjective equivalent of 'close enough'*/) {
-			int minRev = rev1 < rev2 ? rev1 : rev2;
-			int maxRev = minRev == rev1 ? rev2 : rev1;
-			if (minRev > 0) {
-				minRev--; // expand range a bit
-				// XXX perhaps, if revlog.baseRevision is cheap, shall expand minRev up to baseRevision
-				// which gonna be read anyway
+		boolean need1 = !cached(rev1), need2 = !cached(rev2);
+		if (need1 || need2) {
+			int minRev, maxRev;
+			if (need1 && need2 && Math.abs(rev1 - rev2) < 5 /*subjective equivalent of 'close enough'*/) {
+				minRev = rev1 < rev2 ? rev1 : rev2;
+				maxRev = minRev == rev1 ? rev2 : rev1;
+				if (minRev > 0) {
+					minRev--; // expand range a bit
+				}
+				initCacheRange(minRev, maxRev);
+				need1 = need2 = false;
 			}
-	
-			repo.getManifest().walk(minRev, maxRev, new HgManifest.Inspector() {
-				private ManifestRevisionInspector delegate;
-
-				public boolean begin(int revision, Nodeid nid) {
-					cache.put(revision, delegate = new ManifestRevisionInspector(cacheNodes, cacheFilenames));
-					delegate.begin(revision, nid);
-					return true;
-				}
-
-				public boolean next(Nodeid nid, String fname, String flags) {
-					delegate.next(nid, fname, flags);
-					return true;
-				}
-				
-				public boolean end(int revision) {
-					delegate.end(revision);
-					delegate = null;
-					return true;
-				}
-			});
+			// either both unknown and far from each other, or just one of them.
+			// read with neighbors to save potential subsequent calls for neighboring elements
+			// XXX perhaps, if revlog.baseRevision is cheap, shall expand minRev up to baseRevision
+			// which going to be read anyway
+			if (need1) {
+				minRev = rev1;
+				maxRev = rev1 < lastManifestRevision-5 ? rev1+5 : lastManifestRevision;
+				initCacheRange(minRev, maxRev);
+			}
+			if (need2) {
+				minRev = rev2;
+				maxRev = rev2 < lastManifestRevision-5 ? rev2+5 : lastManifestRevision;
+				initCacheRange(minRev, maxRev);
+			}
 		}
 		r1 = get(rev1);
 		r2 = get(rev2);
@@ -396,7 +445,11 @@
 				nid = idsPool.unify(nid);
 			}
 			idsMap.put(fname, nid);
-			flagsMap.put(fname, flags);
+			if (flags != null) {
+				// TreeMap$Entry takes 32 bytes. No reason to keep null for such price
+				// Perhaps, Map<String, Pair<Nodeid, String>> might be better solution
+				flagsMap.put(fname, flags);
+			}
 			return true;
 		}
 
--- a/src/org/tmatesoft/hg/repo/Revlog.java	Mon Apr 18 18:04:24 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/Revlog.java	Tue Apr 19 03:49:29 2011 +0200
@@ -240,7 +240,7 @@
 					}
 				}
 			};
-			stream.iterate(0, -1, false, insp);
+			stream.iterate(0, TIP, false, insp);
 		}
 		
 		public Set<Nodeid> allNodes() {