changeset 262:3dcd3dd90c77

Improve manifest parsing: decode bytes to chars once, minimize arraycopy on String instantiation, keep set of file revisions from previous manifest only
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Thu, 18 Aug 2011 03:46:36 +0200
parents 436bb5f65ce1
children 31f67be94e71
files src/org/tmatesoft/hg/core/Nodeid.java src/org/tmatesoft/hg/internal/Pool.java src/org/tmatesoft/hg/repo/HgManifest.java
diffstat 3 files changed, 87 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/core/Nodeid.java	Wed Aug 17 20:30:06 2011 +0200
+++ b/src/org/tmatesoft/hg/core/Nodeid.java	Thu Aug 18 03:46:36 2011 +0200
@@ -163,7 +163,7 @@
 			throw new IllegalArgumentException();
 		}
 		// XXX is better impl for String possible?
-		return fromAscii(asciiRepresentation.getBytes(), 0, 40);
+		return fromAscii(asciiRepresentation.toCharArray(), 0, 40);
 	}
 	
 	/**
@@ -183,7 +183,8 @@
 			if (hiNibble >= 16 || lowNibble >= 16) {
 				throw new IllegalArgumentException(String.format("Characters '%c%c' (%1$d and %2$d) at index %d are not valid hex digits", asciiRepresentation[j-2], asciiRepresentation[j-1], j-2));
 			}
-			data[i] = (byte) (((hiNibble << 4) | lowNibble) & 0xFF);
+			b = (((hiNibble << 4) | lowNibble) & 0xFF);
+			data[i] = (byte) b;
 			zeroBytes = zeroBytes && b == 0;
 		}
 		if (zeroBytes) {
@@ -191,4 +192,12 @@
 		}
 		return new Nodeid(data, false);
 	}
+	
+	public static Nodeid fromAscii(char[] asciiRepresentation, int offset, int length) {
+		byte[] b = new byte[length];
+		for (int i = 0; i < b.length; i++) {
+			b[i] = (byte) asciiRepresentation[offset+i];
+		}
+		return fromAscii(b, 0, b.length);
+	}
 }
--- a/src/org/tmatesoft/hg/internal/Pool.java	Wed Aug 17 20:30:06 2011 +0200
+++ b/src/org/tmatesoft/hg/internal/Pool.java	Thu Aug 18 03:46:36 2011 +0200
@@ -18,6 +18,8 @@
 
 import java.util.HashMap;
 
+import org.tmatesoft.hg.util.SparseSet;
+
 /**
  * Instance pooling.
  * 
@@ -25,7 +27,20 @@
  * @author TMate Software Ltd.
  */
 public class Pool<T> {
-	private final HashMap<T,T> unify = new HashMap<T, T>();
+	private final HashMap<T,T> unify;
+//	private final SparseSet<T> unify = new SparseSet<T>();
+	
+	public Pool() {
+		unify = new HashMap<T, T>();
+	}
+	
+	public Pool(int sizeHint) {
+		if (sizeHint <= 0) {
+			unify = new HashMap<T, T>();
+		} else {
+			unify = new HashMap<T, T>(sizeHint * 4 / 3, 0.75f);
+		}
+	}
 	
 	public T unify(T t) {
 		T rv = unify.get(t);
@@ -37,14 +52,30 @@
 		return rv;
 	}
 	
+	public boolean contains(T t) {
+		return unify.containsKey(t);
+	}
+	
+	public void record(T t) {
+		unify.put(t, t);
+	}
+	
+	public void clear() {
+		unify.clear();
+	}
+
+	public int size() {
+		return unify.size();
+	}
+
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(Pool.class.getSimpleName());
 		sb.append('<');
-		if (!unify.isEmpty()) {
-			sb.append(unify.keySet().iterator().next().getClass().getName());
-		}
+//		if (!unify.isEmpty()) {
+//			sb.append(unify.keySet().iterator().next().getClass().getName());
+//		}
 		sb.append('>');
 		sb.append(':');
 		sb.append(unify.size());
--- a/src/org/tmatesoft/hg/repo/HgManifest.java	Wed Aug 17 20:30:06 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgManifest.java	Thu Aug 18 03:46:36 2011 +0200
@@ -152,10 +152,10 @@
 	private static class ManifestParser implements RevlogStream.Inspector {
 		private boolean gtg = true; // good to go
 		private final Inspector inspector;
-		private final Pool<Nodeid> nodeidPool;
+		private Pool<Nodeid> nodeidPool;
 		private final Pool<String> fnamePool;
 		private final Pool<String> flagsPool;
-
+		
 		public ManifestParser(Inspector delegate) {
 			assert delegate != null;
 			inspector = delegate;
@@ -163,41 +163,60 @@
 			fnamePool = new Pool<String>();
 			flagsPool = new Pool<String>();
 		}
-
+		
 		public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess da) {
 			if (!gtg) {
 				return;
 			}
 			try {
 				gtg = gtg && inspector.begin(revisionNumber, new Nodeid(nodeid, true), linkRevision);
-				int i;
+				Pool<Nodeid> thisRevPool = new Pool<Nodeid>(nodeidPool.size()); // supply hint to minimize map resize/rehash
 				String fname = null;
 				String flags = null;
 				Nodeid nid = null;
-				byte[] data = da.byteArray();
-				for (i = 0; gtg && i < actualLen; i++) {
-					int x = i;
-					for( ; data[i] != '\n' && i < actualLen; i++) {
-						if (fname == null && data[i] == 0) {
-							fname = fnamePool.unify(new String(data, x, i - x));
-							x = i+1;
+				final char[] nodeidConvertCache = new char[40];
+				String data = new String(da.byteArray());
+				final int dataLen = data.length(); // due to byte->char conversion, may be different
+				for (int x = 0; gtg && x < dataLen; x++) {
+					int start = x;
+					x = data.indexOf('\n', x+1);
+					assert x != -1;
+					int z = data.indexOf('\0', start+1);
+					assert z!= -1;
+					assert z < x;
+					fname = data.substring(start, z);
+					if (fnamePool.contains(fname)) {
+						fname = fnamePool.unify(fname);
+					} else {
+						fnamePool.record(fname = new String(fname));
+					}
+					z++; // cursor at first char of nodeid
+					int nodeidLen = x-z < 40 ? x-z : 40; // if x-z > 40, there are flags
+					data.getChars(z, z+nodeidLen, nodeidConvertCache, 0);
+					nid = nodeidPool.unify(Nodeid.fromAscii(nodeidConvertCache, 0, nodeidLen));
+					thisRevPool.record(nid); // memorize revision for the next iteration. 
+					if (x-z > 40) {
+						// 'x' and 'l' for executable bits and symlinks?
+						// hg --debug manifest shows 644 for each regular file in my repo
+						// for cpython repo, there are 755 in hg --debug output when 'x' flag is present
+						flags = data.substring(z + nodeidLen, x);
+						if (flagsPool.contains(flags)) {
+							flags = flagsPool.unify(flags);
+						} else {
+							flagsPool.record(flags = new String(flags));
 						}
 					}
-					if (i < actualLen) {
-						assert data[i] == '\n'; 
-						int nodeidLen = i - x < 40 ? i-x : 40;
-						nid = nodeidPool.unify(Nodeid.fromAscii(data, x, nodeidLen));
-						if (nodeidLen + x < i) {
-							// 'x' and 'l' for executable bits and symlinks?
-							// hg --debug manifest shows 644 for each regular file in my repo
-							flags = flagsPool.unify(new String(data, x + nodeidLen, i-x-nodeidLen));
-						}
-						gtg = gtg && inspector.next(nid, fname, flags);
-					}
+					gtg = gtg && inspector.next(nid, fname, flags);
 					nid = null;
 					fname = flags = null;
 				}
 				gtg = gtg && inspector.end(revisionNumber);
+				//
+				// keep only actual file revisions, found at this version 
+				// (next manifest is likely to refer to most of them, although in specific cases 
+				// like commit in another branch a lot may be useless)
+				nodeidPool.clear();
+				nodeidPool = thisRevPool;
 			} catch (IOException ex) {
 				throw new HgBadStateException(ex);
 			}