# HG changeset patch # User Artem Tikhomirov # Date 1314131456 -7200 # Node ID c5980f287cc4ef86ea306554053c3cb120248175 # Parent ec921ef0628ec451326aa3f57329e465d34ef2a6 Use StringProxy when parsing manifest to minimize number of useless conversions and array instances diff -r ec921ef0628e -r c5980f287cc4 src/org/tmatesoft/hg/repo/HgManifest.java --- a/src/org/tmatesoft/hg/repo/HgManifest.java Tue Aug 23 21:27:56 2011 +0200 +++ b/src/org/tmatesoft/hg/repo/HgManifest.java Tue Aug 23 22:30:56 2011 +0200 @@ -154,18 +154,29 @@ T get(); } - private static class PoolStringProxy { + /** + * When Pool uses Strings directly, + * ManifestParser creates new String instance with new char[] value, and does byte->char conversion. + * For cpython repo, walk(0..10k), there are over 16 million filenames, of them only 3020 unique. + * This means there are 15.9 million useless char[] instances and byte->char conversions + * + * When String is wrapped into {@link StringProxy}, there's extra overhead of byte[] representation + * of the String, but these are only for unique Strings (3020 in the example above). Besides, I save + * useless char[] and byte->char conversions. + */ + private static class StringProxy { private byte[] data; - private int start, length; - private int hash; + private int start; + private final int hash, length; private String result; - public void init(byte[] data, int start, int length) { + public StringProxy(byte[] data, int start, int length) { this.data = data; this.start = start; this.length = length; - // copy from String.hashCode() + // copy from String.hashCode(). In fact, not necessarily match result of String(data).hashCode + // just need some nice algorithm here int h = 0; byte[] d = data; for (int i = 0, off = start, len = length; i < len; i++) { @@ -176,26 +187,14 @@ @Override public boolean equals(Object obj) { - if (false == obj instanceof PoolStringProxy) { + if (false == obj instanceof StringProxy) { return false; } - PoolStringProxy o = (PoolStringProxy) obj; + StringProxy o = (StringProxy) obj; if (o.result != null && result != null) { return result.equals(o.result); } - if (o.result == null && result != null || o.result != null && result == null) { - String s; PoolStringProxy noString; - if (o.result == null) { - s = result; - noString = o; - } else { - s = o.result; - noString = this; - } - - } - // both are null - if (o.length != length) { + if (o.length != length || o.hash != hash) { return false; } for (int i = 0, x = o.start, y = start; i < length; i++) { @@ -213,30 +212,33 @@ public String freeze() { if (result == null) { result = new String(data, start, length); - data = null; - start = length = -1; + // release reference to bigger data array, make a copy of relevant part only + byte[] d = new byte[length]; + System.arraycopy(data, start, d, 0, length); + data = d; + start = 0; } return result; } } - private static class ManifestParser implements RevlogStream.Inspector/*, Lifecycle */{ + private static class ManifestParser implements RevlogStream.Inspector { private boolean gtg = true; // good to go private final Inspector inspector; private Pool nodeidPool, thisRevPool; - private final Pool fnamePool; - private final Pool flagsPool; + private final Pool fnamePool; + private final Pool flagsPool; private byte[] nodeidLookupBuffer = new byte[20]; // get reassigned each time new Nodeid is added to pool public ManifestParser(Inspector delegate) { assert delegate != null; inspector = delegate; nodeidPool = new Pool(); - fnamePool = new Pool(); - flagsPool = new Pool(); + fnamePool = new Pool(); + flagsPool = new Pool(); thisRevPool = new Pool(); } - + public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess da) { if (!gtg) { return; @@ -252,7 +254,10 @@ int x = i; for( ; data[i] != '\n' && i < actualLen; i++) { if (fname == null && data[i] == 0) { - fname = fnamePool.unify(new String(data, x, i - x)); + StringProxy px = fnamePool.unify(new StringProxy(data, x, i - x)); + // if (cached = fnamePool.unify(px))== px then cacheMiss, else cacheHit + // cpython 0..10k: hits: 15 989 152, misses: 3020 + fname = px.freeze(); x = i+1; } } @@ -272,7 +277,8 @@ if (nodeidLen + x < i) { // 'x' and 'l' for executable bits and symlinks? // hg --debug manifest shows 644 for each regular file in my repo - flags = flagsPool.unify(new String(data, x + nodeidLen, i-x-nodeidLen)); + // for cpython 0..10k, there are 4361062 flagPool checks, and there's only 1 unique flag + flags = flagsPool.unify(new StringProxy(data, x + nodeidLen, i-x-nodeidLen)).freeze(); } gtg = gtg && inspector.next(nid, fname, flags); } @@ -292,15 +298,6 @@ throw new HgBadStateException(ex); } } -// -// public void start(int count, Callback callback, Object token) { -// } -// -// public void finish(Object token) { -// System.out.println(fnamePool); -// System.out.println(nodeidPool); -// System.out.printf("Free mem once parse done: %,d\n", Runtime.getRuntime().freeMemory()); -// } } private static class RevisionMapper implements RevlogStream.Inspector, Lifecycle { diff -r ec921ef0628e -r c5980f287cc4 test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java --- a/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java Tue Aug 23 21:27:56 2011 +0200 +++ b/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java Tue Aug 23 22:30:56 2011 +0200 @@ -66,6 +66,7 @@ } private void manifestWalk() throws Exception { + System.out.println(System.getProperty("java.version")); final long start = System.currentTimeMillis(); final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython")); repository.getManifest().walk(0, 10000, new HgManifest.Inspector() { @@ -87,6 +88,9 @@ // cpython -r 1000: 484 files, -r 2000: 1015 files. Iteration 1000..2000; fnamePool.size:1019 nodeidPool.size:2989 // nodeidPool for two subsequent revisions only: 840. 37 sec for 0..10000. 99 sec for 0..20k // 0..10000 fnamePool: hits:15989152, misses:3020 + // + // With Pool for fname and flags, Nodeid's ascii2bin through local array, overall byte[] iteration, + // 0..10k is 34 seconds now System.out.printf("Total time: %d ms\n", System.currentTimeMillis() - start); System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory()); }