diff src/org/tmatesoft/hg/repo/HgStatusCollector.java @ 197:3a7696fb457c

Investigate optimization options to allow fast processing of huge repositories. Fix defect in StatusCollector that lead to wrong result comparing first revision to empty repo (-1 to 0), due to same TIP constant value
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Tue, 19 Apr 2011 03:49:29 +0200
parents c9b305df0b89
children 047b1dec7a04
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/repo/HgStatusCollector.java	Mon Apr 18 18:04:24 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgStatusCollector.java	Tue Apr 19 03:49:29 2011 +0200
@@ -51,10 +51,11 @@
 	// no cache limit, no nodeids and fname caching - OOME on changeset 1035
 	// no cache limit, but with cached nodeids and filenames - 1730+
 	// cache limit 100 - 19+ minutes to process 10000, and still working (too long, stopped)
-	private final int cacheMaxSize = 100; // do not keep too much manifest revisions
+	private final int cacheMaxSize = 50; // do not keep too much manifest revisions
 	private PathPool pathPool;
 	private final Pool<Nodeid> cacheNodes;
 	private final Pool<String> cacheFilenames; // XXX in fact, need to think if use of PathPool directly instead is better solution
+	private final ManifestRevisionInspector emptyFakeState;
 	
 
 	public HgStatusCollector(HgRepository hgRepo) {
@@ -62,10 +63,10 @@
 		cache = new TreeMap<Integer, ManifestRevisionInspector>();
 		cacheNodes = new Pool<Nodeid>();
 		cacheFilenames = new Pool<String>();
-		ManifestRevisionInspector emptyFakeState = new ManifestRevisionInspector(null, null);
+
+		emptyFakeState = new ManifestRevisionInspector(null, null);
 		emptyFakeState.begin(-1, null);
-		emptyFakeState.end(-1); // FIXME HgRepo.TIP == -1 as well, need to distinguish fake "prior to first" revision from "the very last" 
-		cache.put(-1, emptyFakeState);
+		emptyFakeState.end(-1);
 	}
 	
 	public HgRepository getRepo() {
@@ -75,7 +76,10 @@
 	private ManifestRevisionInspector get(int rev) {
 		ManifestRevisionInspector i = cache.get(rev);
 		if (i == null) {
-			if (cache.size() > cacheMaxSize) {
+			if (rev == -1) {
+				return emptyFakeState;
+			}
+			while (cache.size() > cacheMaxSize) {
 				// assume usually we go from oldest to newest, hence remove oldest as most likely to be no longer necessary
 				cache.remove(cache.firstKey());
 			}
@@ -85,6 +89,51 @@
 		}
 		return i;
 	}
+
+	private boolean cached(int revision) {
+		return cache.containsKey(revision) || revision == -1;
+	}
+	
+	private void initCacheRange(int minRev, int maxRev) {
+		while (cache.size() > cacheMaxSize) {
+			// assume usually we go from oldest to newest, hence remove oldest as most likely to be no longer necessary
+			cache.remove(cache.firstKey());
+		}
+		repo.getManifest().walk(minRev, maxRev, new HgManifest.Inspector() {
+			private ManifestRevisionInspector delegate;
+			private boolean cacheHit; // range may include revisions we already know about, do not re-create them
+
+			public boolean begin(int revision, Nodeid nid) {
+				assert delegate == null;
+				if (cache.containsKey(revision)) { // don't need to check emptyFakeState hit as revision never -1 here
+					cacheHit = true;
+				} else {
+					cache.put(revision, delegate = new ManifestRevisionInspector(cacheNodes, cacheFilenames));
+					// cache may grow bigger than max size here, but it's ok as present simplistic cache clearing mechanism may
+					// otherwise remove entries we just added
+					delegate.begin(revision, nid);
+					cacheHit = false;
+				}
+				return true;
+			}
+
+			public boolean next(Nodeid nid, String fname, String flags) {
+				if (!cacheHit) {
+					delegate.next(nid, fname, flags);
+				}
+				return true;
+			}
+			
+			public boolean end(int revision) {
+				if (!cacheHit) {
+					delegate.end(revision);
+				}
+				cacheHit = false;				
+				delegate = null;
+				return true;
+			}
+		});
+	}
 	
 	/*package-local*/ ManifestRevisionInspector raw(int rev) {
 		return get(rev);
@@ -110,10 +159,12 @@
 		repo.getChangelog().parents(rev, parents, null, null);
 		walk(parents[0], rev, inspector);
 	}
-
+	
 	// I assume revision numbers are the same for changelog and manifest - here 
 	// user would like to pass changelog revision numbers, and I use them directly to walk manifest.
 	// if this assumption is wrong, fix this (lookup manifest revisions from changeset).
+	// rev1 and rev2 may be -1 to indicate comparison to empty repository
+	// argument order matters 
 	public void walk(int rev1, int rev2, HgStatusInspector inspector) {
 		if (rev1 == rev2) {
 			throw new IllegalArgumentException();
@@ -124,44 +175,42 @@
 		if (inspector instanceof Record) {
 			((Record) inspector).init(rev1, rev2, this);
 		}
+		final int lastManifestRevision = repo.getManifest().getLastRevision();
 		if (rev1 == TIP) {
-			rev1 = repo.getManifest().getLastRevision();
+			rev1 = lastManifestRevision;
 		}
 		if (rev2 == TIP) {
-			rev2 = repo.getManifest().getLastRevision(); 
+			rev2 = lastManifestRevision; 
 		}
 		// in fact, rev1 and rev2 are often next (or close) to each other,
 		// thus, we can optimize Manifest reads here (manifest.walk(rev1, rev2))
 		ManifestRevisionInspector r1, r2 ;
-		if (!cache.containsKey(rev1) && !cache.containsKey(rev2) && Math.abs(rev1 - rev2) < 5 /*subjective equivalent of 'close enough'*/) {
-			int minRev = rev1 < rev2 ? rev1 : rev2;
-			int maxRev = minRev == rev1 ? rev2 : rev1;
-			if (minRev > 0) {
-				minRev--; // expand range a bit
-				// XXX perhaps, if revlog.baseRevision is cheap, shall expand minRev up to baseRevision
-				// which gonna be read anyway
+		boolean need1 = !cached(rev1), need2 = !cached(rev2);
+		if (need1 || need2) {
+			int minRev, maxRev;
+			if (need1 && need2 && Math.abs(rev1 - rev2) < 5 /*subjective equivalent of 'close enough'*/) {
+				minRev = rev1 < rev2 ? rev1 : rev2;
+				maxRev = minRev == rev1 ? rev2 : rev1;
+				if (minRev > 0) {
+					minRev--; // expand range a bit
+				}
+				initCacheRange(minRev, maxRev);
+				need1 = need2 = false;
 			}
-	
-			repo.getManifest().walk(minRev, maxRev, new HgManifest.Inspector() {
-				private ManifestRevisionInspector delegate;
-
-				public boolean begin(int revision, Nodeid nid) {
-					cache.put(revision, delegate = new ManifestRevisionInspector(cacheNodes, cacheFilenames));
-					delegate.begin(revision, nid);
-					return true;
-				}
-
-				public boolean next(Nodeid nid, String fname, String flags) {
-					delegate.next(nid, fname, flags);
-					return true;
-				}
-				
-				public boolean end(int revision) {
-					delegate.end(revision);
-					delegate = null;
-					return true;
-				}
-			});
+			// either both unknown and far from each other, or just one of them.
+			// read with neighbors to save potential subsequent calls for neighboring elements
+			// XXX perhaps, if revlog.baseRevision is cheap, shall expand minRev up to baseRevision
+			// which going to be read anyway
+			if (need1) {
+				minRev = rev1;
+				maxRev = rev1 < lastManifestRevision-5 ? rev1+5 : lastManifestRevision;
+				initCacheRange(minRev, maxRev);
+			}
+			if (need2) {
+				minRev = rev2;
+				maxRev = rev2 < lastManifestRevision-5 ? rev2+5 : lastManifestRevision;
+				initCacheRange(minRev, maxRev);
+			}
 		}
 		r1 = get(rev1);
 		r2 = get(rev2);
@@ -396,7 +445,11 @@
 				nid = idsPool.unify(nid);
 			}
 			idsMap.put(fname, nid);
-			flagsMap.put(fname, flags);
+			if (flags != null) {
+				// TreeMap$Entry takes 32 bytes. No reason to keep null for such price
+				// Perhaps, Map<String, Pair<Nodeid, String>> might be better solution
+				flagsMap.put(fname, flags);
+			}
 			return true;
 		}