changeset 329:694ebabb5cb3

Refactor revlog patch mechanism, towards patch merging
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Thu, 13 Oct 2011 03:30:50 +0200
parents a674b8590362
children 9747a786a34d
files src/org/tmatesoft/hg/internal/IntVector.java src/org/tmatesoft/hg/internal/Patch.java src/org/tmatesoft/hg/internal/RevlogStream.java src/org/tmatesoft/hg/repo/HgBundle.java src/org/tmatesoft/hg/repo/HgChangelog.java test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java
diffstat 6 files changed, 220 insertions(+), 115 deletions(-) [+]
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/internal/IntVector.java	Wed Oct 05 07:13:57 2011 +0200
+++ b/src/org/tmatesoft/hg/internal/IntVector.java	Thu Oct 13 03:30:50 2011 +0200
@@ -57,6 +57,15 @@
 	public int size() {
 		return count;
 	}
+	
+	public void clear() {
+		count = 0;
+	}
+	
+	public void trimToSize() {
+		data = toArray(true);
+	}
+
 
 	public int[] toArray() {
 		int[] rv = new int[count];
@@ -77,8 +86,7 @@
 
 	private void grow() {
 		if (increment == 0) {
-			// throw specific exception right away
-			return;
+			throw new UnsupportedOperationException("This vector is not allowed to expand");
 		}
 		int newCapacity = increment < 0 ? data.length << 1 : data.length + increment;
 		assert newCapacity > 0 && newCapacity != data.length : newCapacity;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/org/tmatesoft/hg/internal/Patch.java	Thu Oct 13 03:30:50 2011 +0200
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2011 TMate Software Ltd
+ *  
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * For information on how to redistribute this software under
+ * the terms of a license other than GNU General Public License
+ * contact TMate Software at support@hg4j.com
+ */
+package org.tmatesoft.hg.internal;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ * @see http://mercurial.selenic.com/wiki/BundleFormat, in Changelog group description
+ * 
+ * range [start..end] in original source gets replaced with data of length (do not keep, use data.length instead)
+ * range [end(i)..start(i+1)] is copied from the source
+ * 
+ * @author Artem Tikhomirov
+ * @author TMate Software Ltd.
+ */
+public final class Patch {
+	private final IntVector starts, ends;
+	private final ArrayList<byte[]> data;
+	
+	public Patch() {
+		starts = new IntVector();
+		ends = new IntVector();
+		data = new ArrayList<byte[]>();
+	}
+	
+	public int count() {
+		return data.size();
+	}
+
+	// number of bytes this patch will add (or remove, if negative) from the base revision
+	private int patchSizeDelta() {
+		int rv = 0;
+		int prevEnd = 0;
+		for (int i = 0, x = data.size(); i < x; i++) {
+			final int start = starts.get(i);
+			final int len = data.get(i).length;
+			rv += start - prevEnd; // would copy from original
+			rv += len; // and add new
+			prevEnd = ends.get(i);
+		}
+		rv -= prevEnd;
+		return rv;
+	}
+	
+	public byte[] apply(DataAccess baseRevisionContent, int outcomeLen) throws IOException {
+		if (outcomeLen == -1) {
+			outcomeLen = baseRevisionContent.length() + patchSizeDelta();
+		}
+		int prevEnd = 0, destIndex = 0;
+		byte[] rv = new byte[outcomeLen];
+		for (int i = 0, x = data.size(); i < x; i++) {
+			final int start = starts.get(i);
+			baseRevisionContent.seek(prevEnd);
+			// copy source bytes that were not modified (up to start of the record)
+			baseRevisionContent.readBytes(rv, destIndex, start - prevEnd);
+			destIndex += start - prevEnd;
+			// insert new data from the patch, if any
+			byte[] d = data.get(i);
+			System.arraycopy(d, 0, rv, destIndex, d.length);
+			destIndex += d.length;
+			prevEnd = ends.get(i);
+		}
+		baseRevisionContent.seek(prevEnd);
+		// copy everything in the source past last record's end
+		baseRevisionContent.readBytes(rv, destIndex, (int) (baseRevisionContent.length() - prevEnd));
+		return rv;
+	}
+	
+	public void clear() {
+		starts.clear();
+		ends.clear();
+		data.clear();
+	}
+	
+	/**
+	 * Initialize instance from stream. Any previous patch information (i.e. if instance if reused) is cleared first.
+	 * Read up to the end of DataAccess and interpret data as patch records.
+	 */
+	public void read(DataAccess da) throws IOException {
+		clear();
+		while (!da.isEmpty()) {
+			readOne(da);
+		}
+	}
+
+	/**
+	 * Caller is responsible to ensure stream got some data to read
+	 */
+	public void readOne(DataAccess da) throws IOException {
+		int s = da.readInt();
+		int e = da.readInt();
+		int len = da.readInt();
+		byte[] src = new byte[len];
+		da.readBytes(src, 0, len);
+		starts.add(s);
+		ends.add(e);
+		data.add(src);
+	}
+
+/*
+	private void add(Patch another, int index) {
+		starts.add(another.starts.get(index));
+		ends.add(another.ends.get(index));
+		data.add(another.data.get(index));
+	}
+
+	/**
+	 * Modify this patch with subsequent patch 
+	 * /
+	public void apply(Patch another) {
+		Patch r = new Patch();
+		int p1AppliedPos = 0;
+		int p1PrevEnd = 0;
+		for (int i = 0, j = 0, iMax = another.count(), jMax = this.count(); i < iMax; i++) {
+			int newerPatchEntryStart = another.starts.get(i);
+			int olderPatchEntryEnd;
+			
+			while (j < jMax) {
+				if (starts.get(j) < newerPatchEntryStart) {
+					if (starts.get(j)+data.get(j).length <= newerPatchEntryStart) {
+						r.add(this, j);
+					} else {
+						int newLen = newerPatchEntryStart - starts.get(j);
+						int newEnd = ends.get(j) <= newerPatchEntryStart ? ends.get(j) : newerPatchEntryStart; 
+						r.add(starts.get(j), newEnd, data.get(j), newLen);
+						break;
+					}
+				}
+				p1AppliedPos += starts.get(j) - p1PrevEnd;
+				p1AppliedPos += data.get(j).length;
+				p1PrevEnd = ends.get(j);
+				j++;
+			}
+			r.add(newerPatchEntryStart, another.ends.get(i), another.data.get(i));
+			p1AppliedPos += newerPatchEntryStart + p1PrevEnd - another.data.get(i).length;
+			// either j == jMax and another(i, i+1, ..., iMax) need to be just copied
+			// or new patch entry starts before end of one of original patch entries
+			if (olderPatchEntryEnd > (destPosition + newerPatchEntryStart)) {
+				destPosition += starts.get(j) - prevEnd; // count those in the original stream up to old patch start
+				int newLen = newerPatchEntryStart - destPosition;
+			}
+		}
+	}
+*/
+}
\ No newline at end of file
--- a/src/org/tmatesoft/hg/internal/RevlogStream.java	Wed Oct 05 07:13:57 2011 +0200
+++ b/src/org/tmatesoft/hg/internal/RevlogStream.java	Thu Oct 13 03:30:50 2011 +0200
@@ -21,8 +21,6 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
 import java.util.zip.Inflater;
 
 import org.tmatesoft.hg.core.HgBadStateException;
@@ -392,20 +390,17 @@
 		public boolean range(int start, int end) throws IOException {
 			byte[] nodeidBuf = new byte[20];
 			int i;
-			boolean extraReadsToBaseRev = false; // to indicate we read revision prior to start. XXX not sure can't do without
 			// it (i.e. replace with i >= start)
 			if (needData && (i = getBaseRevision(start)) < start) {
 				// if lastRevisionRead in [baseRevision(start), start)  can reuse lastUserData
 				// doesn't make sense to reuse if lastRevisionRead == start (too much to change in the cycle below). 
 				if (lastRevisionRead != BAD_REVISION && i <= lastRevisionRead && lastRevisionRead < start) {
 					i = lastRevisionRead + 1; // start with first not-yet-read revision
-					extraReadsToBaseRev = i < start;
 				} else {
 					if (lastUserData != null) {
 						lastUserData.done();
 						lastUserData = null;
 					}
-					extraReadsToBaseRev = true;
 				}
 			} else {
 				// don't need to clean lastUserData as it's always null when !needData
@@ -415,7 +410,7 @@
 			daIndex.seek(getIndexOffsetInt(i));
 			//
 			// reuse some instances
-			final ArrayList<PatchRecord> patches = new ArrayList<PatchRecord>();
+			final Patch patch = new Patch();
 			final Inflater inflater = new Inflater();
 			// can share buffer between instances of InflaterDataAccess as I never read any two of them in parallel
 			final byte[] inflaterBuffer = new byte[1024];
@@ -470,12 +465,7 @@
 					// XXX 
 					if (patchToPrevious) {
 						// this is a patch
-						patches.clear(); // won't hurt to ensure there are no leftovers, even if we already cleaned
-						while (!userDataAccess.isEmpty()) {
-							PatchRecord pr = PatchRecord.read(userDataAccess);
-//							System.out.printf("PatchRecord:%d %d %d\n", pr.start, pr.end, pr.len);
-							patches.add(pr);
-						}
+						patch.read(userDataAccess);
 						userDataAccess.done();
 						//
 						// it shall be reset at the end of prev iteration, when it got assigned from userDataAccess
@@ -483,9 +473,9 @@
 						// Alternatively, userDataAccess.done() above may be responsible to reset Inflater (if it's InflaterDataAccess)
 						lastUserData.reset();
 //						final long startMeasuring = System.currentTimeMillis(); // TIMING
-						byte[] userData = apply(lastUserData, actualLen, patches);
+						byte[] userData = patch.apply(lastUserData, actualLen);
 //						applyTime += (System.currentTimeMillis() - startMeasuring); // TIMING
-						patches.clear(); // do not keep any reference, allow PatchRecord to be gc'd
+						patch.clear(); // do not keep any reference, allow byte[] data to be gc'd
 						userDataAccess = new ByteArrayDataAccess(userData);
 					}
 				} else {
@@ -493,7 +483,7 @@
 						daIndex.skip(compressedLen);
 					}
 				}
-				if (!extraReadsToBaseRev || i >= start) {
+				if (i >= start) {
 //					final long startMeasuring = System.currentTimeMillis(); // TIMING
 					inspector.next(i, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeidBuf, userDataAccess);
 //					inspectorTime += (System.currentTimeMillis() - startMeasuring); // TIMING
@@ -517,75 +507,6 @@
 	}
 
 	
-	// mpatch.c : apply()
-	// FIXME need to implement patch merge (fold, combine, gather and discard from aforementioned mpatch.[c|py]), also see Revlog and Mercurial PDF
-	public/*for HgBundle; until moved to better place*/static byte[] apply(DataAccess baseRevisionContent, int outcomeLen, List<PatchRecord> patch) throws IOException {
-		int last = 0, destIndex = 0;
-		if (outcomeLen == -1) {
-			outcomeLen = baseRevisionContent.length();
-			for (int i = 0, x = patch.size(); i < x; i++) {
-				PatchRecord pr = patch.get(i);
-				outcomeLen += pr.start - last + pr.len;
-				last = pr.end;
-			}
-			outcomeLen -= last;
-			last = 0;
-		}
-		byte[] rv = new byte[outcomeLen];
-		for (int i = 0, x = patch.size(); i < x; i++) {
-			PatchRecord pr = patch.get(i);
-			baseRevisionContent.seek(last);
-			baseRevisionContent.readBytes(rv, destIndex, pr.start-last);
-			destIndex += pr.start - last;
-			System.arraycopy(pr.data, 0, rv, destIndex, pr.data.length);
-			destIndex += pr.data.length;
-			last = pr.end;
-		}
-		baseRevisionContent.seek(last);
-		baseRevisionContent.readBytes(rv, destIndex, (int) (baseRevisionContent.length() - last));
-		return rv;
-	}
-
-	// @see http://mercurial.selenic.com/wiki/BundleFormat, in Changelog group description
-	public static class PatchRecord {
-		/*
-		   Given there are pr1 and pr2:
-		     pr1.start to pr1.end will be replaced with pr's data (of pr1.len)
-		     pr1.end to pr2.start gets copied from base
-		 */
-		public int start, end, len;
-		public byte[] data;
-
-		// TODO consider PatchRecord that only records data position (absolute in data source), and acquires data as needed 
-		private PatchRecord(int p1, int p2, int length, byte[] src) {
-			start = p1;
-			end = p2;
-			len = length;
-			data = src;
-		}
-
-		/*package-local*/ static PatchRecord read(byte[] data, int offset) {
-			final int x = offset; // shorthand
-			int p1 =  ((data[x] & 0xFF)<< 24)    | ((data[x+1] & 0xFF) << 16) | ((data[x+2] & 0xFF) << 8)  | (data[x+3] & 0xFF);
-			int p2 =  ((data[x+4] & 0xFF) << 24) | ((data[x+5] & 0xFF) << 16) | ((data[x+6] & 0xFF) << 8)  | (data[x+7] & 0xFF);
-			int len = ((data[x+8] & 0xFF) << 24) | ((data[x+9] & 0xFF) << 16) | ((data[x+10] & 0xFF) << 8) | (data[x+11] & 0xFF);
-			byte[] dataCopy = new byte[len];
-			System.arraycopy(data, x+12, dataCopy, 0, len);
-			return new PatchRecord(p1, p2, len, dataCopy);
-		}
-
-		public /*for HgBundle*/ static PatchRecord read(DataAccess da) throws IOException {
-			int p1 = da.readInt();
-			int p2 = da.readInt();
-			int len = da.readInt();
-			byte[] src = new byte[len];
-			da.readBytes(src, 0, len);
-			return new PatchRecord(p1, p2, len, src);
-		}
-	}
-
-	// FIXME byte[] data might be too expensive, for few usecases it may be better to have intermediate Access object (when we don't need full data 
-	// instantly - e.g. calculate hash, or comparing two revisions
 	public interface Inspector {
 		// XXX boolean retVal to indicate whether to continue?
 		// TODO specify nodeid and data length, and reuse policy (i.e. if revlog stream doesn't reuse nodeid[] for each call)
--- a/src/org/tmatesoft/hg/repo/HgBundle.java	Wed Oct 05 07:13:57 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgBundle.java	Thu Oct 13 03:30:50 2011 +0200
@@ -19,7 +19,6 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.LinkedList;
-import java.util.List;
 
 import org.tmatesoft.hg.core.HgBadStateException;
 import org.tmatesoft.hg.core.HgException;
@@ -31,7 +30,7 @@
 import org.tmatesoft.hg.internal.DataAccessProvider;
 import org.tmatesoft.hg.internal.DigestHelper;
 import org.tmatesoft.hg.internal.InflaterDataAccess;
-import org.tmatesoft.hg.internal.RevlogStream;
+import org.tmatesoft.hg.internal.Patch;
 import org.tmatesoft.hg.repo.HgChangelog.RawChangeset;
 import org.tmatesoft.hg.util.CancelledException;
 
@@ -239,7 +238,7 @@
 
 		public boolean element(GroupElement ge) {
 			try {
-				System.out.printf("  %s %s %s %s; patches:%d\n", ge.node(), ge.firstParent(), ge.secondParent(), ge.cset(), ge.patches().size());
+				System.out.printf("  %s %s %s %s; patches:%d\n", ge.node(), ge.firstParent(), ge.secondParent(), ge.cset(), ge.patch().count());
 			} catch (Exception ex) {
 				ex.printStackTrace(); // FIXME
 			}
@@ -397,10 +396,11 @@
 		}
 	}
 
+	// FIXME GroupElement exposes some internal API!!! (DataAccess)
 	public static class GroupElement {
 		private final byte[] header; // byte[80] takes 120 bytes, 4 Nodeids - 192
 		private final DataAccess dataAccess;
-		private List<RevlogStream.PatchRecord> patches;
+		private Patch patches;
 
 		GroupElement(byte[] fourNodeids, DataAccess rawDataAccess) {
 			assert fourNodeids != null && fourNodeids.length == 80;
@@ -432,21 +432,17 @@
 			return dataAccess;
 		}
 		
-		public List<RevlogStream.PatchRecord> patches() throws IOException {
+		/*package-local*/ Patch patch() throws IOException {
 			if (patches == null) {
 				dataAccess.reset();
-				LinkedList<RevlogStream.PatchRecord> p = new LinkedList<RevlogStream.PatchRecord>();
-				while (!dataAccess.isEmpty()) {
-					RevlogStream.PatchRecord pr = RevlogStream.PatchRecord.read(dataAccess);
-					p.add(pr);
-				}
-				patches = p;
+				patches = new Patch();
+				patches.read(dataAccess);
 			}
 			return patches;
 		}
 
 		public byte[] apply(DataAccess baseContent) throws IOException {
-			return RevlogStream.apply(baseContent, -1, patches());
+			return patch().apply(baseContent, -1);
 		}
 	}
 }
--- a/src/org/tmatesoft/hg/repo/HgChangelog.java	Wed Oct 05 07:13:57 2011 +0200
+++ b/src/org/tmatesoft/hg/repo/HgChangelog.java	Thu Oct 13 03:30:50 2011 +0200
@@ -32,7 +32,6 @@
 import java.util.TimeZone;
 
 import org.tmatesoft.hg.core.HgBadStateException;
-import org.tmatesoft.hg.core.HgLogCommand;
 import org.tmatesoft.hg.core.Nodeid;
 import org.tmatesoft.hg.internal.DataAccess;
 import org.tmatesoft.hg.internal.IterateControlMediator;
--- a/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java	Wed Oct 05 07:13:57 2011 +0200
+++ b/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java	Thu Oct 13 03:30:50 2011 +0200
@@ -40,7 +40,8 @@
 	public static void main(String[] args) throws Exception {
 		MapTagsToFileRevisions m = new MapTagsToFileRevisions();
 		System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory());
-		m.collectTagsPerFile();
+		m.measurePatchAffectsArbitraryRevisionRead();
+//		m.collectTagsPerFile();
 //		m.manifestWalk();
 //		m.changelogWalk();
 //		m.revisionMap();
@@ -50,6 +51,24 @@
 		System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory());
 	}
 	
+
+	// revision == 2406  -   5 ms per run (baseRevision == 2406)
+	// revision == 2405  -  69 ms per run (baseRevision == 1403)
+	private void measurePatchAffectsArbitraryRevisionRead() throws Exception {
+		final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython"));
+		final DoNothingManifestInspector insp = new DoNothingManifestInspector();
+		final int revision = 2405;
+		// warm-up.
+		repository.getManifest().walk(revision, revision, insp);
+		final int runs = 10;
+		final long start = System.nanoTime();
+		for (int i = 0; i < runs; i++) {
+			repository.getManifest().walk(revision, revision, insp);
+		}
+		final long end = System.nanoTime();
+		System.out.printf("%d ms per run\n", (end - start)/ (runs*1000000));
+	}
+
 	/*
 	 * .hgtags, 261 revisions
 	 * Approach 1: total 83, init: 0, iteration: 82
@@ -174,20 +193,7 @@
 		System.out.println(System.getProperty("java.version"));
 		final long start = System.currentTimeMillis();
 		final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython"));
-		repository.getManifest().walk(0, 10000, new HgManifest.Inspector2() {
-			public boolean begin(int mainfestRevision, Nodeid nid, int changelogRevision) {
-				return true;
-			}
-			public boolean next(Nodeid nid, String fname, String flags) {
-				throw new HgBadStateException(HgManifest.Inspector2.class.getName());
-			}
-			public boolean next(Nodeid nid, Path fname, Flags flags) {
-				return true;
-			}
-			public boolean end(int manifestRevision) {
-				return true;
-			}
-		});
+		repository.getManifest().walk(0, 10000, new DoNothingManifestInspector());
 		// cpython: 1,1 sec for 0..1000, 43 sec for 0..10000, 115 sec for 0..20000 (Pool with HashMap)
 		// 2,4 sec for 1000..2000
 		// cpython -r 1000: 484 files, -r 2000: 1015 files. Iteration 1000..2000; fnamePool.size:1019 nodeidPool.size:2989
@@ -240,7 +246,7 @@
 		final IntMap<List<TagInfo>> tagLocalRev2TagInfo = new IntMap<List<TagInfo>>(allTags.length);
 		System.out.printf("Collecting manifests for %d tags\n", allTags.length);
 		final int[] tagLocalRevs = collectLocalTagRevisions(clogrmap, allTags, tagLocalRev2TagInfo);
-		System.out.printf("Prepared tag revisions to analyze: %d ms\n", System.currentTimeMillis() - start);
+		System.out.printf("Prepared %d tag revisions to analyze: %d ms\n", tagLocalRevs.length, System.currentTimeMillis() - start);
 
 		final Path targetPath = Path.create("README");
 		//
@@ -365,6 +371,21 @@
 		System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory());
 	}
 
+	static class DoNothingManifestInspector implements HgManifest.Inspector2 {
+		public boolean begin(int mainfestRevision, Nodeid nid, int changelogRevision) {
+			return true;
+		}
+		public boolean next(Nodeid nid, String fname, String flags) {
+			throw new HgBadStateException(HgManifest.Inspector2.class.getName());
+		}
+		public boolean next(Nodeid nid, Path fname, Flags flags) {
+			return true;
+		}
+		public boolean end(int manifestRevision) {
+			return true;
+		}
+	}
+	
 	public static void main2(String[] args) throws HgException, CancelledException {
 		final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython"));
 		final Path targetPath = Path.create("README");