Mercurial > hg4j
changeset 329:694ebabb5cb3
Refactor revlog patch mechanism, towards patch merging
author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
---|---|
date | Thu, 13 Oct 2011 03:30:50 +0200 |
parents | a674b8590362 |
children | 9747a786a34d |
files | src/org/tmatesoft/hg/internal/IntVector.java src/org/tmatesoft/hg/internal/Patch.java src/org/tmatesoft/hg/internal/RevlogStream.java src/org/tmatesoft/hg/repo/HgBundle.java src/org/tmatesoft/hg/repo/HgChangelog.java test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java |
diffstat | 6 files changed, 220 insertions(+), 115 deletions(-) [+] |
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/internal/IntVector.java Wed Oct 05 07:13:57 2011 +0200 +++ b/src/org/tmatesoft/hg/internal/IntVector.java Thu Oct 13 03:30:50 2011 +0200 @@ -57,6 +57,15 @@ public int size() { return count; } + + public void clear() { + count = 0; + } + + public void trimToSize() { + data = toArray(true); + } + public int[] toArray() { int[] rv = new int[count]; @@ -77,8 +86,7 @@ private void grow() { if (increment == 0) { - // throw specific exception right away - return; + throw new UnsupportedOperationException("This vector is not allowed to expand"); } int newCapacity = increment < 0 ? data.length << 1 : data.length + increment; assert newCapacity > 0 && newCapacity != data.length : newCapacity;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/org/tmatesoft/hg/internal/Patch.java Thu Oct 13 03:30:50 2011 +0200 @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2011 TMate Software Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * For information on how to redistribute this software under + * the terms of a license other than GNU General Public License + * contact TMate Software at support@hg4j.com + */ +package org.tmatesoft.hg.internal; + +import java.io.IOException; +import java.util.ArrayList; + +/** + * @see http://mercurial.selenic.com/wiki/BundleFormat, in Changelog group description + * + * range [start..end] in original source gets replaced with data of length (do not keep, use data.length instead) + * range [end(i)..start(i+1)] is copied from the source + * + * @author Artem Tikhomirov + * @author TMate Software Ltd. + */ +public final class Patch { + private final IntVector starts, ends; + private final ArrayList<byte[]> data; + + public Patch() { + starts = new IntVector(); + ends = new IntVector(); + data = new ArrayList<byte[]>(); + } + + public int count() { + return data.size(); + } + + // number of bytes this patch will add (or remove, if negative) from the base revision + private int patchSizeDelta() { + int rv = 0; + int prevEnd = 0; + for (int i = 0, x = data.size(); i < x; i++) { + final int start = starts.get(i); + final int len = data.get(i).length; + rv += start - prevEnd; // would copy from original + rv += len; // and add new + prevEnd = ends.get(i); + } + rv -= prevEnd; + return rv; + } + + public byte[] apply(DataAccess baseRevisionContent, int outcomeLen) throws IOException { + if (outcomeLen == -1) { + outcomeLen = baseRevisionContent.length() + patchSizeDelta(); + } + int prevEnd = 0, destIndex = 0; + byte[] rv = new byte[outcomeLen]; + for (int i = 0, x = data.size(); i < x; i++) { + final int start = starts.get(i); + baseRevisionContent.seek(prevEnd); + // copy source bytes that were not modified (up to start of the record) + baseRevisionContent.readBytes(rv, destIndex, start - prevEnd); + destIndex += start - prevEnd; + // insert new data from the patch, if any + byte[] d = data.get(i); + System.arraycopy(d, 0, rv, destIndex, d.length); + destIndex += d.length; + prevEnd = ends.get(i); + } + baseRevisionContent.seek(prevEnd); + // copy everything in the source past last record's end + baseRevisionContent.readBytes(rv, destIndex, (int) (baseRevisionContent.length() - prevEnd)); + return rv; + } + + public void clear() { + starts.clear(); + ends.clear(); + data.clear(); + } + + /** + * Initialize instance from stream. Any previous patch information (i.e. if instance if reused) is cleared first. + * Read up to the end of DataAccess and interpret data as patch records. + */ + public void read(DataAccess da) throws IOException { + clear(); + while (!da.isEmpty()) { + readOne(da); + } + } + + /** + * Caller is responsible to ensure stream got some data to read + */ + public void readOne(DataAccess da) throws IOException { + int s = da.readInt(); + int e = da.readInt(); + int len = da.readInt(); + byte[] src = new byte[len]; + da.readBytes(src, 0, len); + starts.add(s); + ends.add(e); + data.add(src); + } + +/* + private void add(Patch another, int index) { + starts.add(another.starts.get(index)); + ends.add(another.ends.get(index)); + data.add(another.data.get(index)); + } + + /** + * Modify this patch with subsequent patch + * / + public void apply(Patch another) { + Patch r = new Patch(); + int p1AppliedPos = 0; + int p1PrevEnd = 0; + for (int i = 0, j = 0, iMax = another.count(), jMax = this.count(); i < iMax; i++) { + int newerPatchEntryStart = another.starts.get(i); + int olderPatchEntryEnd; + + while (j < jMax) { + if (starts.get(j) < newerPatchEntryStart) { + if (starts.get(j)+data.get(j).length <= newerPatchEntryStart) { + r.add(this, j); + } else { + int newLen = newerPatchEntryStart - starts.get(j); + int newEnd = ends.get(j) <= newerPatchEntryStart ? ends.get(j) : newerPatchEntryStart; + r.add(starts.get(j), newEnd, data.get(j), newLen); + break; + } + } + p1AppliedPos += starts.get(j) - p1PrevEnd; + p1AppliedPos += data.get(j).length; + p1PrevEnd = ends.get(j); + j++; + } + r.add(newerPatchEntryStart, another.ends.get(i), another.data.get(i)); + p1AppliedPos += newerPatchEntryStart + p1PrevEnd - another.data.get(i).length; + // either j == jMax and another(i, i+1, ..., iMax) need to be just copied + // or new patch entry starts before end of one of original patch entries + if (olderPatchEntryEnd > (destPosition + newerPatchEntryStart)) { + destPosition += starts.get(j) - prevEnd; // count those in the original stream up to old patch start + int newLen = newerPatchEntryStart - destPosition; + } + } + } +*/ +} \ No newline at end of file
--- a/src/org/tmatesoft/hg/internal/RevlogStream.java Wed Oct 05 07:13:57 2011 +0200 +++ b/src/org/tmatesoft/hg/internal/RevlogStream.java Thu Oct 13 03:30:50 2011 +0200 @@ -21,8 +21,6 @@ import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import java.util.zip.Inflater; import org.tmatesoft.hg.core.HgBadStateException; @@ -392,20 +390,17 @@ public boolean range(int start, int end) throws IOException { byte[] nodeidBuf = new byte[20]; int i; - boolean extraReadsToBaseRev = false; // to indicate we read revision prior to start. XXX not sure can't do without // it (i.e. replace with i >= start) if (needData && (i = getBaseRevision(start)) < start) { // if lastRevisionRead in [baseRevision(start), start) can reuse lastUserData // doesn't make sense to reuse if lastRevisionRead == start (too much to change in the cycle below). if (lastRevisionRead != BAD_REVISION && i <= lastRevisionRead && lastRevisionRead < start) { i = lastRevisionRead + 1; // start with first not-yet-read revision - extraReadsToBaseRev = i < start; } else { if (lastUserData != null) { lastUserData.done(); lastUserData = null; } - extraReadsToBaseRev = true; } } else { // don't need to clean lastUserData as it's always null when !needData @@ -415,7 +410,7 @@ daIndex.seek(getIndexOffsetInt(i)); // // reuse some instances - final ArrayList<PatchRecord> patches = new ArrayList<PatchRecord>(); + final Patch patch = new Patch(); final Inflater inflater = new Inflater(); // can share buffer between instances of InflaterDataAccess as I never read any two of them in parallel final byte[] inflaterBuffer = new byte[1024]; @@ -470,12 +465,7 @@ // XXX if (patchToPrevious) { // this is a patch - patches.clear(); // won't hurt to ensure there are no leftovers, even if we already cleaned - while (!userDataAccess.isEmpty()) { - PatchRecord pr = PatchRecord.read(userDataAccess); -// System.out.printf("PatchRecord:%d %d %d\n", pr.start, pr.end, pr.len); - patches.add(pr); - } + patch.read(userDataAccess); userDataAccess.done(); // // it shall be reset at the end of prev iteration, when it got assigned from userDataAccess @@ -483,9 +473,9 @@ // Alternatively, userDataAccess.done() above may be responsible to reset Inflater (if it's InflaterDataAccess) lastUserData.reset(); // final long startMeasuring = System.currentTimeMillis(); // TIMING - byte[] userData = apply(lastUserData, actualLen, patches); + byte[] userData = patch.apply(lastUserData, actualLen); // applyTime += (System.currentTimeMillis() - startMeasuring); // TIMING - patches.clear(); // do not keep any reference, allow PatchRecord to be gc'd + patch.clear(); // do not keep any reference, allow byte[] data to be gc'd userDataAccess = new ByteArrayDataAccess(userData); } } else { @@ -493,7 +483,7 @@ daIndex.skip(compressedLen); } } - if (!extraReadsToBaseRev || i >= start) { + if (i >= start) { // final long startMeasuring = System.currentTimeMillis(); // TIMING inspector.next(i, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeidBuf, userDataAccess); // inspectorTime += (System.currentTimeMillis() - startMeasuring); // TIMING @@ -517,75 +507,6 @@ } - // mpatch.c : apply() - // FIXME need to implement patch merge (fold, combine, gather and discard from aforementioned mpatch.[c|py]), also see Revlog and Mercurial PDF - public/*for HgBundle; until moved to better place*/static byte[] apply(DataAccess baseRevisionContent, int outcomeLen, List<PatchRecord> patch) throws IOException { - int last = 0, destIndex = 0; - if (outcomeLen == -1) { - outcomeLen = baseRevisionContent.length(); - for (int i = 0, x = patch.size(); i < x; i++) { - PatchRecord pr = patch.get(i); - outcomeLen += pr.start - last + pr.len; - last = pr.end; - } - outcomeLen -= last; - last = 0; - } - byte[] rv = new byte[outcomeLen]; - for (int i = 0, x = patch.size(); i < x; i++) { - PatchRecord pr = patch.get(i); - baseRevisionContent.seek(last); - baseRevisionContent.readBytes(rv, destIndex, pr.start-last); - destIndex += pr.start - last; - System.arraycopy(pr.data, 0, rv, destIndex, pr.data.length); - destIndex += pr.data.length; - last = pr.end; - } - baseRevisionContent.seek(last); - baseRevisionContent.readBytes(rv, destIndex, (int) (baseRevisionContent.length() - last)); - return rv; - } - - // @see http://mercurial.selenic.com/wiki/BundleFormat, in Changelog group description - public static class PatchRecord { - /* - Given there are pr1 and pr2: - pr1.start to pr1.end will be replaced with pr's data (of pr1.len) - pr1.end to pr2.start gets copied from base - */ - public int start, end, len; - public byte[] data; - - // TODO consider PatchRecord that only records data position (absolute in data source), and acquires data as needed - private PatchRecord(int p1, int p2, int length, byte[] src) { - start = p1; - end = p2; - len = length; - data = src; - } - - /*package-local*/ static PatchRecord read(byte[] data, int offset) { - final int x = offset; // shorthand - int p1 = ((data[x] & 0xFF)<< 24) | ((data[x+1] & 0xFF) << 16) | ((data[x+2] & 0xFF) << 8) | (data[x+3] & 0xFF); - int p2 = ((data[x+4] & 0xFF) << 24) | ((data[x+5] & 0xFF) << 16) | ((data[x+6] & 0xFF) << 8) | (data[x+7] & 0xFF); - int len = ((data[x+8] & 0xFF) << 24) | ((data[x+9] & 0xFF) << 16) | ((data[x+10] & 0xFF) << 8) | (data[x+11] & 0xFF); - byte[] dataCopy = new byte[len]; - System.arraycopy(data, x+12, dataCopy, 0, len); - return new PatchRecord(p1, p2, len, dataCopy); - } - - public /*for HgBundle*/ static PatchRecord read(DataAccess da) throws IOException { - int p1 = da.readInt(); - int p2 = da.readInt(); - int len = da.readInt(); - byte[] src = new byte[len]; - da.readBytes(src, 0, len); - return new PatchRecord(p1, p2, len, src); - } - } - - // FIXME byte[] data might be too expensive, for few usecases it may be better to have intermediate Access object (when we don't need full data - // instantly - e.g. calculate hash, or comparing two revisions public interface Inspector { // XXX boolean retVal to indicate whether to continue? // TODO specify nodeid and data length, and reuse policy (i.e. if revlog stream doesn't reuse nodeid[] for each call)
--- a/src/org/tmatesoft/hg/repo/HgBundle.java Wed Oct 05 07:13:57 2011 +0200 +++ b/src/org/tmatesoft/hg/repo/HgBundle.java Thu Oct 13 03:30:50 2011 +0200 @@ -19,7 +19,6 @@ import java.io.File; import java.io.IOException; import java.util.LinkedList; -import java.util.List; import org.tmatesoft.hg.core.HgBadStateException; import org.tmatesoft.hg.core.HgException; @@ -31,7 +30,7 @@ import org.tmatesoft.hg.internal.DataAccessProvider; import org.tmatesoft.hg.internal.DigestHelper; import org.tmatesoft.hg.internal.InflaterDataAccess; -import org.tmatesoft.hg.internal.RevlogStream; +import org.tmatesoft.hg.internal.Patch; import org.tmatesoft.hg.repo.HgChangelog.RawChangeset; import org.tmatesoft.hg.util.CancelledException; @@ -239,7 +238,7 @@ public boolean element(GroupElement ge) { try { - System.out.printf(" %s %s %s %s; patches:%d\n", ge.node(), ge.firstParent(), ge.secondParent(), ge.cset(), ge.patches().size()); + System.out.printf(" %s %s %s %s; patches:%d\n", ge.node(), ge.firstParent(), ge.secondParent(), ge.cset(), ge.patch().count()); } catch (Exception ex) { ex.printStackTrace(); // FIXME } @@ -397,10 +396,11 @@ } } + // FIXME GroupElement exposes some internal API!!! (DataAccess) public static class GroupElement { private final byte[] header; // byte[80] takes 120 bytes, 4 Nodeids - 192 private final DataAccess dataAccess; - private List<RevlogStream.PatchRecord> patches; + private Patch patches; GroupElement(byte[] fourNodeids, DataAccess rawDataAccess) { assert fourNodeids != null && fourNodeids.length == 80; @@ -432,21 +432,17 @@ return dataAccess; } - public List<RevlogStream.PatchRecord> patches() throws IOException { + /*package-local*/ Patch patch() throws IOException { if (patches == null) { dataAccess.reset(); - LinkedList<RevlogStream.PatchRecord> p = new LinkedList<RevlogStream.PatchRecord>(); - while (!dataAccess.isEmpty()) { - RevlogStream.PatchRecord pr = RevlogStream.PatchRecord.read(dataAccess); - p.add(pr); - } - patches = p; + patches = new Patch(); + patches.read(dataAccess); } return patches; } public byte[] apply(DataAccess baseContent) throws IOException { - return RevlogStream.apply(baseContent, -1, patches()); + return patch().apply(baseContent, -1); } } }
--- a/src/org/tmatesoft/hg/repo/HgChangelog.java Wed Oct 05 07:13:57 2011 +0200 +++ b/src/org/tmatesoft/hg/repo/HgChangelog.java Thu Oct 13 03:30:50 2011 +0200 @@ -32,7 +32,6 @@ import java.util.TimeZone; import org.tmatesoft.hg.core.HgBadStateException; -import org.tmatesoft.hg.core.HgLogCommand; import org.tmatesoft.hg.core.Nodeid; import org.tmatesoft.hg.internal.DataAccess; import org.tmatesoft.hg.internal.IterateControlMediator;
--- a/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java Wed Oct 05 07:13:57 2011 +0200 +++ b/test/org/tmatesoft/hg/test/MapTagsToFileRevisions.java Thu Oct 13 03:30:50 2011 +0200 @@ -40,7 +40,8 @@ public static void main(String[] args) throws Exception { MapTagsToFileRevisions m = new MapTagsToFileRevisions(); System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory()); - m.collectTagsPerFile(); + m.measurePatchAffectsArbitraryRevisionRead(); +// m.collectTagsPerFile(); // m.manifestWalk(); // m.changelogWalk(); // m.revisionMap(); @@ -50,6 +51,24 @@ System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory()); } + + // revision == 2406 - 5 ms per run (baseRevision == 2406) + // revision == 2405 - 69 ms per run (baseRevision == 1403) + private void measurePatchAffectsArbitraryRevisionRead() throws Exception { + final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython")); + final DoNothingManifestInspector insp = new DoNothingManifestInspector(); + final int revision = 2405; + // warm-up. + repository.getManifest().walk(revision, revision, insp); + final int runs = 10; + final long start = System.nanoTime(); + for (int i = 0; i < runs; i++) { + repository.getManifest().walk(revision, revision, insp); + } + final long end = System.nanoTime(); + System.out.printf("%d ms per run\n", (end - start)/ (runs*1000000)); + } + /* * .hgtags, 261 revisions * Approach 1: total 83, init: 0, iteration: 82 @@ -174,20 +193,7 @@ System.out.println(System.getProperty("java.version")); final long start = System.currentTimeMillis(); final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython")); - repository.getManifest().walk(0, 10000, new HgManifest.Inspector2() { - public boolean begin(int mainfestRevision, Nodeid nid, int changelogRevision) { - return true; - } - public boolean next(Nodeid nid, String fname, String flags) { - throw new HgBadStateException(HgManifest.Inspector2.class.getName()); - } - public boolean next(Nodeid nid, Path fname, Flags flags) { - return true; - } - public boolean end(int manifestRevision) { - return true; - } - }); + repository.getManifest().walk(0, 10000, new DoNothingManifestInspector()); // cpython: 1,1 sec for 0..1000, 43 sec for 0..10000, 115 sec for 0..20000 (Pool with HashMap) // 2,4 sec for 1000..2000 // cpython -r 1000: 484 files, -r 2000: 1015 files. Iteration 1000..2000; fnamePool.size:1019 nodeidPool.size:2989 @@ -240,7 +246,7 @@ final IntMap<List<TagInfo>> tagLocalRev2TagInfo = new IntMap<List<TagInfo>>(allTags.length); System.out.printf("Collecting manifests for %d tags\n", allTags.length); final int[] tagLocalRevs = collectLocalTagRevisions(clogrmap, allTags, tagLocalRev2TagInfo); - System.out.printf("Prepared tag revisions to analyze: %d ms\n", System.currentTimeMillis() - start); + System.out.printf("Prepared %d tag revisions to analyze: %d ms\n", tagLocalRevs.length, System.currentTimeMillis() - start); final Path targetPath = Path.create("README"); // @@ -365,6 +371,21 @@ System.out.printf("Free mem: %,d\n", Runtime.getRuntime().freeMemory()); } + static class DoNothingManifestInspector implements HgManifest.Inspector2 { + public boolean begin(int mainfestRevision, Nodeid nid, int changelogRevision) { + return true; + } + public boolean next(Nodeid nid, String fname, String flags) { + throw new HgBadStateException(HgManifest.Inspector2.class.getName()); + } + public boolean next(Nodeid nid, Path fname, Flags flags) { + return true; + } + public boolean end(int manifestRevision) { + return true; + } + } + public static void main2(String[] args) throws HgException, CancelledException { final HgRepository repository = new HgLookup().detect(new File("/temp/hg/cpython")); final Path targetPath = Path.create("README");