changeset 268:c5980f287cc4

Use StringProxy when parsing manifest to minimize number of useless conversions and array instances
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Tue, 23 Aug 2011 22:30:56 +0200 (2011-08-23)
parents ec921ef0628e
children 7af843ecc378
files src/org/tmatesoft/hg/repo/HgManifest.java test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java
diffstat 2 files changed, 40 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/repo/HgManifest.java	Tue Aug 23 21:27:56 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgManifest.java	Tue Aug 23 22:30:56 2011 +0200
@@ -154,18 +154,29 @@
 		T get();
 	}
 	
-	private static class PoolStringProxy {
+	/**
+	 * When Pool uses Strings directly,
+	 * ManifestParser creates new String instance with new char[] value, and does byte->char conversion.
+	 * For cpython repo, walk(0..10k), there are over 16 million filenames, of them only 3020 unique.
+	 * This means there are 15.9 million useless char[] instances and byte->char conversions  
+	 * 
+	 * When String is wrapped into {@link StringProxy}, there's extra overhead of byte[] representation
+	 * of the String, but these are only for unique Strings (3020 in the example above). Besides, I save
+	 * useless char[] and byte->char conversions. 
+	 */
+	private static class StringProxy {
 		private byte[] data;
-		private int start, length;
-		private int hash;
+		private int start; 
+		private final int hash, length;
 		private String result;
 
-		public void init(byte[] data, int start, int length) {
+		public StringProxy(byte[] data, int start, int length) {
 			this.data = data;
 			this.start = start;
 			this.length = length;
 
-			// copy from String.hashCode()
+			// copy from String.hashCode(). In fact, not necessarily match result of String(data).hashCode
+			// just need some nice algorithm here
 			int h = 0;
 			byte[] d = data;
 			for (int i = 0, off = start, len = length; i < len; i++) {
@@ -176,26 +187,14 @@
 
 		@Override
 		public boolean equals(Object obj) {
-			if (false == obj instanceof PoolStringProxy) {
+			if (false == obj instanceof StringProxy) {
 				return false;
 			}
-			PoolStringProxy o = (PoolStringProxy) obj;
+			StringProxy o = (StringProxy) obj;
 			if (o.result != null && result != null) {
 				return result.equals(o.result);
 			}
-			if (o.result == null && result != null || o.result != null && result == null) {
-				String s; PoolStringProxy noString; 
-				if (o.result == null) {
-					s = result;
-					noString = o;
-				} else {
-					s = o.result;
-					noString = this;
-				}
-				
-			}
-			// both are null
-			if (o.length != length) {
+			if (o.length != length || o.hash != hash) {
 				return false;
 			}
 			for (int i = 0, x = o.start, y = start; i < length; i++) {
@@ -213,30 +212,33 @@
 		public String freeze() {
 			if (result == null) {
 				result = new String(data, start, length);
-				data = null;
-				start = length = -1;
+				// release reference to bigger data array, make a copy of relevant part only
+				byte[] d = new byte[length];
+				System.arraycopy(data, start, d, 0, length);
+				data = d;
+				start = 0;
 			}
 			return result;
 		}
 	}
 
-	private static class ManifestParser implements RevlogStream.Inspector/*, Lifecycle */{
+	private static class ManifestParser implements RevlogStream.Inspector {
 		private boolean gtg = true; // good to go
 		private final Inspector inspector;
 		private Pool<Nodeid> nodeidPool, thisRevPool;
-		private final Pool<String> fnamePool;
-		private final Pool<String> flagsPool;
+		private final Pool<StringProxy> fnamePool;
+		private final Pool<StringProxy> flagsPool;
 		private byte[] nodeidLookupBuffer = new byte[20]; // get reassigned each time new Nodeid is added to pool
 		
 		public ManifestParser(Inspector delegate) {
 			assert delegate != null;
 			inspector = delegate;
 			nodeidPool = new Pool<Nodeid>();
-			fnamePool = new Pool<String>();
-			flagsPool = new Pool<String>();
+			fnamePool = new Pool<StringProxy>();
+			flagsPool = new Pool<StringProxy>();
 			thisRevPool = new Pool<Nodeid>();
 		}
-
+		
 		public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess da) {
 			if (!gtg) {
 				return;
@@ -252,7 +254,10 @@
 					int x = i;
 					for( ; data[i] != '\n' && i < actualLen; i++) {
 						if (fname == null && data[i] == 0) {
-							fname = fnamePool.unify(new String(data, x, i - x));
+							StringProxy px = fnamePool.unify(new StringProxy(data, x, i - x));
+							// if (cached = fnamePool.unify(px))== px then cacheMiss, else cacheHit
+							// cpython 0..10k: hits: 15 989 152, misses: 3020
+							fname = px.freeze();
 							x = i+1;
 						}
 					}
@@ -272,7 +277,8 @@
 						if (nodeidLen + x < i) {
 							// 'x' and 'l' for executable bits and symlinks?
 							// hg --debug manifest shows 644 for each regular file in my repo
-							flags = flagsPool.unify(new String(data, x + nodeidLen, i-x-nodeidLen));
+							// for cpython 0..10k, there are 4361062 flagPool checks, and there's only 1 unique flag
+							flags = flagsPool.unify(new StringProxy(data, x + nodeidLen, i-x-nodeidLen)).freeze();
 						}
 						gtg = gtg && inspector.next(nid, fname, flags);
 					}
@@ -292,15 +298,6 @@
 				throw new HgBadStateException(ex);
 			}
 		}
-//
-//		public void start(int count, Callback callback, Object token) {
-//		}
-//
-//		public void finish(Object token) {
-//			System.out.println(fnamePool);
-//			System.out.println(nodeidPool);
-//			System.out.printf("Free mem once parse done: %,d\n", Runtime.getRuntime().freeMemory());
-//		}
 	}
 	
 	private static class RevisionMapper implements RevlogStream.Inspector, Lifecycle {
--- a/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java	Tue Aug 23 21:27:56 2011 +0200
+++ b/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java	Tue Aug 23 22:30:56 2011 +0200
@@ -66,6 +66,7 @@
 	}
 
 	private void manifestWalk() throws Exception {
+		System.out.println(System.getProperty("java.version"));
 		final long start = System.currentTimeMillis();
 		final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython"));
 		repository.getManifest().walk(0, 10000, new HgManifest.Inspector() {
@@ -87,6 +88,9 @@
 		// cpython -r 1000: 484 files, -r 2000: 1015 files. Iteration 1000..2000; fnamePool.size:1019 nodeidPool.size:2989
 		// nodeidPool for two subsequent revisions only: 840. 37 sec for 0..10000. 99 sec for 0..20k
 		// 0..10000 fnamePool: hits:15989152, misses:3020
+		//
+		// With Pool<StringProxy> for fname and flags, Nodeid's ascii2bin through local array, overall byte[] iteration, 
+		// 0..10k is 34 seconds now
 		System.out.printf("Total time: %d ms\n", System.currentTimeMillis() - start);
 		System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory());
 	}