changeset 304:85b8efde5586

Use memory-friendly set implementation to canonicalize filenames and nodeids
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Wed, 21 Sep 2011 18:26:16 +0200
parents 2ffcbf360fd5
children ae8d116f4ee2
files src/org/tmatesoft/hg/core/Nodeid.java src/org/tmatesoft/hg/internal/Pool2.java src/org/tmatesoft/hg/repo/HgManifest.java src/org/tmatesoft/hg/util/DirectHashSet.java src/org/tmatesoft/hg/util/SparseSet.java test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java
diffstat 6 files changed, 150 insertions(+), 325 deletions(-) [+]
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/core/Nodeid.java	Tue Sep 20 04:43:39 2011 +0200
+++ b/src/org/tmatesoft/hg/core/Nodeid.java	Wed Sep 21 18:26:16 2011 +0200
@@ -76,8 +76,11 @@
 	
 	@Override
 	public boolean equals(Object o) {
+		if (o == this) {
+			return true;
+		}
 		if (o instanceof Nodeid) {
-			return this == o || equalsTo(((Nodeid) o).binaryData);
+			return equalsTo(((Nodeid) o).binaryData);
 		}
 		return false;
 	}
--- a/src/org/tmatesoft/hg/internal/Pool2.java	Tue Sep 20 04:43:39 2011 +0200
+++ b/src/org/tmatesoft/hg/internal/Pool2.java	Wed Sep 21 18:26:16 2011 +0200
@@ -16,7 +16,8 @@
  */
 package org.tmatesoft.hg.internal;
 
-import org.tmatesoft.hg.util.SparseSet;
+import org.tmatesoft.hg.util.DirectHashSet;
+
 
 /**
  *
@@ -24,7 +25,7 @@
  * @author TMate Software Ltd.
  */
 public class Pool2<T> {
-	private final SparseSet<T> unify = new SparseSet<T>();
+	private final DirectHashSet<T> unify = new DirectHashSet<T>();
 	
 	public Pool2() {
 	}
@@ -57,10 +58,6 @@
 	public int size() {
 		return unify.size();
 	}
-	
-	public void x() {
-		unify.dump();
-	}
 
 	@Override
 	public String toString() {
--- a/src/org/tmatesoft/hg/repo/HgManifest.java	Tue Sep 20 04:43:39 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgManifest.java	Wed Sep 21 18:26:16 2011 +0200
@@ -29,7 +29,7 @@
 import org.tmatesoft.hg.internal.DigestHelper;
 import org.tmatesoft.hg.internal.Experimental;
 import org.tmatesoft.hg.internal.Lifecycle;
-import org.tmatesoft.hg.internal.Pool;
+import org.tmatesoft.hg.internal.Pool2;
 import org.tmatesoft.hg.internal.RevlogStream;
 import org.tmatesoft.hg.util.Path;
 
@@ -281,17 +281,17 @@
 		private boolean gtg = true; // good to go
 		private final Inspector inspector;
 		private final Inspector2 inspector2;
-		private Pool<Nodeid> nodeidPool, thisRevPool;
-		private final Pool<PathProxy> fnamePool;
+		private Pool2<Nodeid> nodeidPool, thisRevPool;
+		private final Pool2<PathProxy> fnamePool;
 		private byte[] nodeidLookupBuffer = new byte[20]; // get reassigned each time new Nodeid is added to pool
 		
 		public ManifestParser(Inspector delegate) {
 			assert delegate != null;
 			inspector = delegate;
 			inspector2 = delegate instanceof Inspector2 ? (Inspector2) delegate : null;
-			nodeidPool = new Pool<Nodeid>();
-			fnamePool = new Pool<PathProxy>();
-			thisRevPool = new Pool<Nodeid>();
+			nodeidPool = new Pool2<Nodeid>();
+			fnamePool = new Pool2<PathProxy>();
+			thisRevPool = new Pool2<Nodeid>();
 		}
 		
 		public void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[] nodeid, DataAccess da) {
@@ -354,7 +354,7 @@
 				// (next manifest is likely to refer to most of them, although in specific cases 
 				// like commit in another branch a lot may be useless)
 				nodeidPool.clear();
-				Pool<Nodeid> t = nodeidPool;
+				Pool2<Nodeid> t = nodeidPool;
 				nodeidPool = thisRevPool;
 				thisRevPool = t;
 			} catch (IOException ex) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/org/tmatesoft/hg/util/DirectHashSet.java	Wed Sep 21 18:26:16 2011 +0200
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2011 TMate Software Ltd
+ *  
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * For information on how to redistribute this software under
+ * the terms of a license other than GNU General Public License
+ * contact TMate Software at support@hg4j.com
+ */
+package org.tmatesoft.hg.util;
+
+import org.tmatesoft.hg.internal.Experimental;
+
+/**
+ * Memory-friendly alternative to HashSet. With slightly worse performance than that of HashSet, uses n * sizeof(HashMap.Entry) less memory 
+ * (i.e. for set of 50k elements saves more than 1 Mb of memory). Besides, elements of this set can be obtained (not only queried for presence) -  
+ * the option essential for canonical mappings (aka Pool)
+ * 
+ * @author Artem Tikhomirov
+ * @author TMate Software Ltd.
+ */
+@Experimental
+public class DirectHashSet<T> {
+	
+	private Object[] table;
+	private int size;
+	private int threshold;
+
+	public DirectHashSet() {
+		this (16);
+	}
+
+	public DirectHashSet(int capacity) {
+        int result = 2;
+        while (result < capacity) {
+        	result <<= 1;
+        }
+		table = new Object[result];
+		threshold = result - (result >>> 2);
+	}
+
+	/**
+	 * Add element to the set.
+	 * @param o element, shall not be <code>null</code>
+	 * @return previous element from the set equal to the argument
+	 */
+	@SuppressWarnings("unchecked")
+	public T put(T o) {
+		final int h = hash(o);
+		final int mask = table.length - 1;
+		int i = h & mask;
+		Object t;
+		while ((t = table[i]) != null) {
+			if (t.equals(o)) {
+				table[i] = o;
+				return (T) t;
+			}
+			i = (i+1) & mask;
+		}
+		table[i] = o;
+		if (++size >= threshold) {
+			resize();
+		}
+		return null;
+	}
+	
+	/**
+	 * Query set for the element.
+	 * @param o element, shall not be <code>null</code>
+	 * @return element from the set, if present
+	 */
+	@SuppressWarnings("unchecked")
+	public T get(T o) {
+		final int h = hash(o);
+		final int mask = table.length - 1;
+		int i = h & mask;
+		Object t;
+		while ((t = table[i]) != null) {
+			if (t == o || t.equals(o)) {
+				return (T) t;
+			}
+			i = (i+1) & mask;
+		}
+		return null;
+	}
+	
+	public int size() {
+		return size;
+	}
+	
+	public void clear() {
+		Object[] t = table;
+		for (int i = 0, top = t.length; i < top; i++) {
+			t[i] = null;
+		}
+		size = 0;
+	}
+	
+	private void resize() {
+		final int newSize = table.length << 1;
+		final int newMask = newSize - 1;
+		Object[] newTable = new Object[newSize];
+		for (int i = 0, size = table.length; i < size; i++) {
+			Object t = table[i];
+			if (t != null) {
+				table[i] = null;
+				int x = hash(t) & newMask;
+				while (newTable[x] != null) {
+					x = (x+1) & newMask;
+				}
+				newTable[x] = t;
+			}
+		}
+		table = newTable;
+		threshold = newSize - (newSize >>> 2);
+	}
+
+	private static int hash(Object o) {
+		int h = o.hashCode();
+//		return h;
+		// HashMap.newHash()
+		h ^= (h >>> 20) ^ (h >>> 12);
+        return h ^ (h >>> 7) ^ (h >>> 4);
+	}
+
+}
--- a/src/org/tmatesoft/hg/util/SparseSet.java	Tue Sep 20 04:43:39 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,309 +0,0 @@
-/*
- * Copyright (c) 2011 TMate Software Ltd
- *  
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * For information on how to redistribute this software under
- * the terms of a license other than GNU General Public License
- * contact TMate Software at support@hg4j.com
- */
-package org.tmatesoft.hg.util;
-
-import java.util.Arrays;
-
-import org.tmatesoft.hg.internal.Experimental;
-
-/**
- * WORK IN PROGRESS, DO NOT USE
- * Memory-friendly alternative to HashMap-backed Pool. Set where object can be obtained (not only queried for presence)
- * 
- * cpython repo, use of HashMap Pool results in ~6 Mb of Map.Entry and Map.Entry[],
- * while use of SparseSet result in 2 Mb. 
- * 
- * @author Artem Tikhomirov
- * @author TMate Software Ltd.
- */
-@Experimental(reason="Requires tuning to accomodate to collection size. Present state (6-6-6) is too much for a lot of uses")
-public class SparseSet<T> {
-	
-	public static void main(String[] args) {
-		SparseSet<String> ss = new SparseSet<String>();
-		String one = Integer.toString(156), two = Integer.toString(1024), three = Integer.toString(1123123);
-		ss.put(one);
-		ss.put(two);
-		ss.put(three);
-		System.out.println(one == ss.get(one));
-		System.out.println(two == ss.get(two));
-		System.out.println(three == ss.get(three));
-		System.out.println(null == ss.get("one"));
-		System.out.println(one == ss.get(Integer.toString(156)));
-		System.out.println(two == ss.get(Integer.toString(1024)));
-		System.out.println(three == ss.get(Integer.toString(1123123)));
-		ss.dump();
-	}
-
-	@SuppressWarnings("unused")
-	private static final int MASK_8BIT = 0xFF, MASK_7BIT = 0x7F, MASK_6BIT = 0x3F, MASK_5BIT = 0x1F, MASK_4BIT = 0x0F;
-	private static final int I1_SHIFT = 15, I2_SHIFT = 6, I3_SHIFT = 0;
-	// 6, 5, 5
-	private static final int I1_MASK = MASK_7BIT, I2_MASK = MASK_4BIT, I3_MASK = MASK_4BIT;
-
-	private final int[] fixups = new int[] {0x1, 0x10, 0xA, 0xD, 0x1F }; // rehash attempts
-	private final IndexBranch[] level2 = new IndexBranch[I1_MASK + 1];
-	private int size = 0;
-	
-
-	//
-	int directPut, neighborPut;
-	int[] fixupPut1 = new int[fixups.length], fixupPut2 = new int[fixups.length];;
-
-	public void put(T o) {
-		final int hash = hash(o);
-		final int i1 = (hash >>> I1_SHIFT) & I1_MASK, i2 = (hash >>> I2_SHIFT) & I2_MASK, i3 = (hash >>> I3_SHIFT) & I3_MASK;
-		LeafBranch l3 = leafBranchPut(i1, i2);
-		int res;
-		if ((res = l3.put(i3, o)) != 0) {
-			size++;
-			if (res == 1) {
-				directPut++;
-			} else if (res == 2) {
-				neighborPut++;
-			}
-			return;
-		}
-		for (int i = 0; i < fixups.length; i++) {
-			int fixup = fixups[i];
-			l3 = leafBranchPut(i1 ^ fixup, i2);
-			if (l3.putIfEmptyOrSame(i3, o)) {
-				size++;
-				fixupPut1[i]++;
-				return;
-			}
-			l3 = leafBranchPut(i1, i2 ^ fixup);
-			if (l3.putIfEmptyOrSame(i3, o)) {
-				size++;
-				fixupPut2[i]++;
-				return;
-			}
-		}
-		throw new IllegalStateException(String.valueOf(o));
-	}
-	
-	@SuppressWarnings("unchecked")
-	public T get(T o) {
-		final int hash = hash(o);
-		final int i1 = (hash >>> I1_SHIFT) & I1_MASK, i2 = (hash >>> I2_SHIFT) & I2_MASK, i3 = (hash >>> I3_SHIFT) & I3_MASK;
-		//
-		LeafBranch l3 = leafBranchGet(i1, i2);
-		if (l3 == null) {
-			return null;
-		}
-		Object c;
-		if ((c = l3.get(i3, o)) != null) {
-			return c == l3 ? null : (T) c;
-		}
-		if ((c = l3.get(i3 ^ 0x1, o)) != null) {
-			return c == l3 ? null : (T) c;
-		}
-		if ((c = l3.get(i3 ^ 0x2, o)) != null) {
-			return c == l3 ? null : (T) c;
-		}
-		if ((c = l3.get(i3 ^ 0x3, o)) != null) {
-			return c == l3 ? null : (T) c;
-		}
-		for (int fixup : fixups) {
-			Object data = leafValueGet(i1 ^ fixup, i2, i3);
-			if (data == null) {
-				return null;
-			}
-			if (o.equals(data)) {
-				return (T)data;
-			}
-			data = leafValueGet(i1, i2 ^ fixup, i3);
-			if (data == null) {
-				return null;
-			}
-			if (o.equals(data)) {
-				return (T)data;
-			}
-		}
-		dump();
-		throw new IllegalStateException(String.format("[%d,%d,%d] hash: 0x%X, hash2: 0x%X, %s", i1, i2, i3, o.hashCode(), hash, o));
-	}
-
-	public int size() {
-		return size;
-	}
-	private LeafBranch leafBranchPut(int i1, int i2) {
-		IndexBranch l2 = level2[i1];
-		if (l2 == null) {
-			level2[i1] = l2 = new IndexBranch();
-		}
-		LeafBranch l3 = l2.leafs[i2];
-		if (l3 == null) {
-			l2.leafs[i2] = l3 = new LeafBranch();
-		}
-		return l3;
-	}
-
-	// unlike regular collection clear, keeps all allocated arrays to minimize gc/reallocate costs
-	// do force clean, use #drop
-	public void clear() {
-		for (int i1 = 0; i1 < level2.length; i1++) {
-			IndexBranch l2 = level2[i1];
-			if (l2 == null) {
-				continue;
-			}
-			for (int i2 = 0; i2 < l2.leafs.length; i2++) {
-				LeafBranch l3 = l2.leafs[i2];
-				if (l3 == null) {
-					continue;
-				}
-				for (int i3 = 0; i3 < l3.data.length; i3++) {
-					l3.data[i3] = null;
-				}
-			}
-		}
-		reset();
-	}
-	
-	public void drop() {
-		reset();
-		for (int i1 = 0; i1 < level2.length; level2[i1++] = null);
-	}
-	
-	private void reset() {
-		size = 0;
-		directPut = neighborPut = 0;
-		Arrays.fill(fixupPut1, 0);
-		Arrays.fill(fixupPut2, 0);
-	}
-
-	private LeafBranch leafBranchGet(int i1, int i2) {
-		IndexBranch l2 = level2[i1];
-		if (l2 == null) {
-			return null;
-		}
-		return l2.leafs[i2];
-	}
-
-	private Object leafValueGet(int i1, int i2, int i3) {
-		IndexBranch l2 = level2[i1];
-		if (l2 == null) {
-			return null;
-		}
-		LeafBranch l3 = l2.leafs[i2];
-		if (l3 == null) {
-			return null;
-		}
-		return l3.data[i3];
-	}
-	
-	private int hash(Object o) {
-		int h = o.hashCode();
-		// HashMap.newHash()
-		h ^= (h >>> 20) ^ (h >>> 12);
-        return h ^ (h >>> 7) ^ (h >>> 4);
-	}
-
-	@Override
-	public String toString() {
-		return String.format("SparseSet (0x%02X-0x%02X-0x%02X), %d elements. Direct: %d. Resolutions: neighbour: %d, i1: %s. i2: %s", I1_MASK, I2_MASK, I3_MASK, size, directPut, neighborPut, Arrays.toString(fixupPut1), Arrays.toString(fixupPut2));
-	}
-
-	public void dump() {
-		int count = 0;
-		System.out.println(toString());
-		for (int i = 0; i < level2.length; i++) {
-			IndexBranch l2 = level2[i];
-			if (l2 == null) {
-				continue;
-			}
-			for (int j = 0; j < l2.leafs.length; j++) {
-				LeafBranch l3 = l2.leafs[j];
-				if (l3 == null) {
-					continue;
-				}
-				for (int k = 0; k < l3.data.length; k++) {
-					Object d = l3.data[k];
-					if (d != null) {
-						System.out.printf("[%3d,%3d,%3d] %s\n", i,j,k,d);
-						count++;
-					}
-				}
-			}
-		}
-		System.out.printf("Total: %d elements\n", count);
-	}
-
-	private static class IndexBranch {
-		private final LeafBranch[] leafs = new LeafBranch[64];
-	}
-	
-	private static final class LeafBranch {
-		public final Object[] data = new Object[64];
-
-		public int put(int ix, Object d) {
-			if (putIfEmptyOrSame(ix, d)) {
-				return 1;
-			}
-			// try neighbour elements
-			if (putIfEmptyOrSame(ix ^ 0x1, d) || putIfEmptyOrSame(ix ^ 0x2, d) || putIfEmptyOrSame(ix ^ 0x3, d)) {
-				return 2;
-			}
-			return 0;
-		}
-
-		public boolean putIfEmptyOrSame(int ix, Object d) {
-			if (data[ix] == null || data[ix].equals(d)) {
-				data[ix] = d;
-				return true;
-			}
-			return false;
-		}
-
-		/**
-		 * <code>null</code> result indicates further checks make sense
-		 * @return <code>this</code> if there's no entry at all, <code>null</code> if entry doesn't match, or entry value itself otherwise
-		 */
-		public Object get(int ix, Object o) {
-			if (data[ix] == null) {
-				return this;
-			}
-			if (data[ix].equals(o)) {
-				return data[ix];
-			}
-			return null;
-		}
-	}
-
-	//
-	// 8 bits per level
-//	int i1 = (hash >>> 24) & 0xFF, i2 = (hash >>> 16) & 0xFF , i3 = (hash >>> 8) & 0xFF, i4 = hash & 0xFF;
-	//
-	// 10, 8, 8 and 6 bits
-//	final int i1 = (hash >>> 22) & 0x3FF, i2 = (hash >>> 14) & 0xFF , i3 = (hash >>> 6) & 0xFF, i4 = hash & 0x3F;
-	//
-	// 8, 6, 6, 6, 6
-	// 10, 6, 6, 6, 4
-	//
-	// 6, 5, 5, 5 = 21 bit
-//	hash = hash ^ (hash >>> 24); // incorporate upper byte we don't use into lower to value it
-//final int i1 = (hash >>> 18) & 0x3F, i2 = (hash >>> 12) & 0x1F , i3 = (hash >>> 7) & 0x1F, i4 = (hash >>> 2) & 0x1F;
-// 6, 5, 5
-//hash = hash ^ (hash >>> 16);
-//final int i1 = (hash >>> 10) & 0x3F, i2 = (hash >>> 5) & 0x1F , i3 = hash & 0x1F;
-//
-// 6, 6, 6
-//final int i1 = (hash >>> 15) & 0x3F, i2 = (hash >>> 6) & 0x3F , i3 = hash & 0x3F;
-//
-// 8, 5, 5
-
-}
--- a/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java	Tue Sep 20 04:43:39 2011 +0200
+++ b/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java	Wed Sep 21 18:26:16 2011 +0200
@@ -39,8 +39,8 @@
 //		Pattern p = Pattern.compile("^doc/[^/]*?\\.[0-9]\\.(x|ht)ml");
 //		System.out.println(p.matcher("doc/asd.2.xml").matches());
 //		System.out.println(p.matcher("doc/zxc.6.html").matches());
-		m.collectTagsPerFile();
-//		m.manifestWalk();
+//		m.collectTagsPerFile();
+		m.manifestWalk();
 		m = null;
 		System.gc();
 		System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory());
@@ -92,6 +92,7 @@
 		//
 		// With Pool<StringProxy> for fname and flags, Nodeid's ascii2bin through local array, overall byte[] iteration, 
 		// 0..10k is 34 seconds now
+		// Another run, 23 seconds now, seems nothing has been changed. Switched to Pool2 with DirectHashSet: 22,5 seconds
 		System.out.printf("Total time: %d ms\n", System.currentTimeMillis() - start);
 		System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory());
 	}