Mercurial > hg4j
changeset 533:e6f72c9829a6
Generate patches using diff algorithm
author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
---|---|
date | Wed, 30 Jan 2013 15:48:36 +0100 |
parents | 688c1ab113bb |
children | 243202f1bda5 |
files | src/org/tmatesoft/hg/internal/Patch.java src/org/tmatesoft/hg/internal/PatchGenerator.java src/org/tmatesoft/hg/internal/RevlogDump.java src/org/tmatesoft/hg/internal/RevlogStreamWriter.java |
diffstat | 4 files changed, 368 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/internal/Patch.java Wed Jan 23 19:14:15 2013 +0100 +++ b/src/org/tmatesoft/hg/internal/Patch.java Wed Jan 30 15:48:36 2013 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2012 TMate Software Ltd + * Copyright (c) 2011-2013 TMate Software Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,8 +24,8 @@ * @see http://mercurial.selenic.com/wiki/BundleFormat * in Changelog group description * - * range [start..end] in original source gets replaced with data of length (do not keep, use data.length instead) - * range [end(i)..start(i+1)] is copied from the source + * range [start..end) in original source gets replaced with data of length (do not keep, use data.length instead) + * range [end(i)..start(i+1)) is copied from the source * * @author Artem Tikhomirov * @author TMate Software Ltd. @@ -159,7 +159,7 @@ add(p.starts.get(i), p.ends.get(i), p.data.get(i)); } - private void add(int start, int end, byte[] d) { + /*package-local*/ void add(int start, int end, byte[] d) { starts.add(start); ends.add(end); data.add(d);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/org/tmatesoft/hg/internal/PatchGenerator.java Wed Jan 30 15:48:36 2013 +0100 @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2013 TMate Software Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * For information on how to redistribute this software under + * the terms of a license other than GNU General Public License + * contact TMate Software at support@hg4j.com + */ +package org.tmatesoft.hg.internal; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import org.tmatesoft.hg.repo.HgDataFile; +import org.tmatesoft.hg.repo.HgLookup; +import org.tmatesoft.hg.repo.HgRepository; + +/** + * Mercurial cares about changes only up to the line level, e.g. a simple file version bump in manifest looks like (RevlogDump output): + * + * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 + * <PATCH>: + * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 + * + * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded + * in the patch. + * + * Mercurial paper describes reasons for choosing this approach to delta generation, too. + * + * + * @author Artem Tikhomirov + * @author TMate Software Ltd. + */ +public class PatchGenerator { + + private Map<ChunkSequence.ByteChain, IntVector> chunk2UseIndex; + private ChunkSequence seq1, seq2; + + // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively + private int matchStartS1, matchStartS2; + // get filled by #findMatchingBlocks, track start of changed/unknown sequence in seq1 and seq2 + private int changeStartS1, changeStartS2; + + public void init(byte[] data1, byte[] data2) { + seq1 = new ChunkSequence(data1); + seq1.splitByNewlines(); + seq2 = new ChunkSequence(data2); + seq2.splitByNewlines(); + prepare(seq2); + } + + private void prepare(ChunkSequence s2) { + chunk2UseIndex = new HashMap<ChunkSequence.ByteChain, IntVector>(); + for (int i = 0, len = s2.chunkCount(); i < len; i++) { + ChunkSequence.ByteChain bc = s2.chunk(i); + IntVector loc = chunk2UseIndex.get(bc); + if (loc == null) { + chunk2UseIndex.put(bc, loc = new IntVector()); + } + loc.add(i); + // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect + // in this case need to find the only ByteChain to keep indexes + // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) + // or kept within only one of them + } +// for (ChunkSequence.ByteChain bc : chunk2UseIndex.keySet()) { +// System.out.printf("%s: {", new String(bc.data())); +// for (int x : chunk2UseIndex.get(bc).toArray()) { +// System.out.printf(" %d,", x); +// } +// System.out.println("}"); +// } + } + + public void findMatchingBlocks() { + changeStartS1 = changeStartS2 = 0; + findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); + if (changeStartS1 < seq1.chunkCount() || changeStartS2 < seq2.chunkCount()) { + reportDeltaElement(seq1.chunkCount(), seq2.chunkCount()); + } + } + + /** + * implementation based on Python's difflib.py and SequenceMatcher + */ + public int longestMatch(int startS1, int endS1, int startS2, int endS2) { + matchStartS1 = matchStartS2 = 0; + int maxLength = 0; + IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8); + for (int i = startS1; i < endS1; i++) { + ChunkSequence.ByteChain bc = seq1.chunk(i); + IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8); + IntVector occurencesInS2 = chunk2UseIndex.get(bc); + if (occurencesInS2 == null) { + // chunkIndex2MatchCount.clear(); // TODO need clear instead of new instance + chunkIndex2MatchCount = newChunkIndex2MatchCount; + continue; + } + for (int j : occurencesInS2.toArray()) { + // s1[i] == s2[j] + if (j < startS2) { + continue; + } + if (j >= endS2) { + break; + } + int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; + int k = prevChunkMatches + 1; + newChunkIndex2MatchCount.put(j, k); + if (k > maxLength) { + matchStartS1 = i-k+1; + matchStartS2 = j-k+1; + maxLength = k; + } + } + chunkIndex2MatchCount = newChunkIndex2MatchCount; + } + return maxLength; + } + + public void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { + int matchLength = longestMatch(startS1, endS1, startS2, endS2); + if (matchLength > 0) { + final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; + if (startS1 < matchStartS1 && startS2 < matchStartS2) { + findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); + } + reportDeltaElement(saveStartS1, saveStartS2); + changeStartS1 = saveStartS1 + matchLength; + changeStartS2 = saveStartS2 + matchLength; +// System.out.printf("match: from line #%d and line #%d of length %d\n", saveStartS1, saveStartS2, matchLength); + if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { + findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); + } + } + } + + private Patch deltaCollector; + + private void reportDeltaElement(int i, int j) { + if (changeStartS1 < i) { + if (changeStartS2 < j) { + System.out.printf("changed [%d..%d) with [%d..%d)\n", changeStartS1, i, changeStartS2, j); + } else { + assert changeStartS2 == j; + System.out.printf("deleted [%d..%d)\n", changeStartS1, i); + } + if (deltaCollector != null) { + int from = seq1.chunk(changeStartS1).getOffset(); + int to = seq1.chunk(i).getOffset(); + byte[] data = seq2.data(changeStartS2, j); + deltaCollector.add(from, to, data); + } + } else { + assert changeStartS1 == i; + if(changeStartS2 < j) { + System.out.printf("added [%d..%d)\n", changeStartS2, j); + } else { + assert changeStartS2 == j; + System.out.printf("adjustent equal blocks %d, %d and %d,%d\n", changeStartS1, i, changeStartS2, j); + } + if (deltaCollector != null) { + int insPoint = seq1.chunk(changeStartS1).getOffset(); + byte[] data = seq2.data(changeStartS2, j); + deltaCollector.add(insPoint, insPoint, data); + } + } + } + + public static void main(String[] args) throws Exception { + HgRepository repo = new HgLookup().detectFromWorkingDir(); + HgDataFile df = repo.getFileNode("cmdline/org/tmatesoft/hg/console/Main.java"); + ByteArrayChannel bac1, bac2; + df.content(80, bac1 = new ByteArrayChannel()); + df.content(81, bac2 = new ByteArrayChannel()); +// String s1 = "line 1\nline 2\r\nline 3\n\nline 1\nline 2"; +// String s2 = "abc\ncdef\r\nline 2\r\nline 3\nline 2"; + PatchGenerator pg = new PatchGenerator(); + pg.init(bac1.toArray(), bac2.toArray()); + pg.findMatchingBlocks(); + } + + public Patch delta(byte[] prev, byte[] content) { + deltaCollector = new Patch(); + init(prev, content); + findMatchingBlocks(); + return deltaCollector; + } + + private static class ChunkSequence { + + private final byte[] input; + private ArrayList<ByteChain> lines; + + public ChunkSequence(byte[] data) { + input = data; + } + + public void splitByNewlines() { + lines = new ArrayList<ByteChain>(); + int lastStart = 0; + for (int i = 0; i < input.length; i++) { + if (input[i] == '\n') { + lines.add(new ByteChain(lastStart, i+1)); + lastStart = i+1; + } else if (input[i] == '\r') { + if (i+1 < input.length && input[i+1] == '\n') { + i++; + } + lines.add(new ByteChain(lastStart, i+1)); + lastStart = i+1; + } + } + if (lastStart < input.length) { + lines.add(new ByteChain(lastStart, input.length)); + } + } + + public ByteChain chunk(int index) { + return lines.get(index); + } + + public int chunkCount() { + return lines.size(); + } + + public byte[] data(int chunkFrom, int chunkTo) { + if (chunkFrom == chunkTo) { + return new byte[0]; + } + int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); + byte[] rv = new byte[to - from]; + System.arraycopy(input, from, rv, 0, rv.length); + return rv; + } + + + final class ByteChain { + private final int start, end; + private final int hash; + + ByteChain(int s, int e) { + start = s; + end = e; + hash = calcHash(input, s, e); + } + + /** + * byte offset of the this ByteChain inside ChainSequence + */ + public int getOffset() { + return start; + } + + public byte[] data() { + byte[] rv = new byte[end - start]; + System.arraycopy(input, start, rv, 0, rv.length); + return rv; + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != ByteChain.class) { + return false; + } + ByteChain other = (ByteChain) obj; + if (other.hash != hash || other.end - other.start != end - start) { + return false; + } + return other.match(input, start); + } + + private boolean match(byte[] oi, int from) { + for (int i = start, j = from; i < end; i++, j++) { + if (ChunkSequence.this.input[i] != oi[j]) { + return false; + } + } + return true; + } + + @Override + public int hashCode() { + return hash; + } + + @Override + public String toString() { + return String.format("[@%d\"%s\"]", start, new String(data())); + } + } + + // same as Arrays.hashCode(byte[]), just for a slice of a bigger array + static int calcHash(byte[] data, int from, int to) { + int result = 1; + for (int i = from; i < to; i++) { + result = 31 * result + data[i]; + } + return result; + } + } +}
--- a/src/org/tmatesoft/hg/internal/RevlogDump.java Wed Jan 23 19:14:15 2013 +0100 +++ b/src/org/tmatesoft/hg/internal/RevlogDump.java Wed Jan 30 15:48:36 2013 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2012 TMate Software Ltd + * Copyright (c) 2010-2013 TMate Software Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -27,6 +27,8 @@ import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.zip.Inflater; /** @@ -48,6 +50,7 @@ String filename = "store/00changelog.i"; // String filename = "store/data/hello.c.i"; // String filename = "store/data/docs/readme.i"; + System.out.println(escape("abc\0def\nzxc\tmnb")); boolean dumpDataFull = true; boolean dumpDataStats = false; if (args.length > 1) { @@ -140,7 +143,7 @@ byte[] src = new byte[l]; dis.read(src, 0, l); sb.append(":"); - sb.append(new String(src, 0, l, "UTF-8")); + sb.append(escape(new String(src, 0, l, "UTF-8"))); } else { dis.skipBytes(l); } @@ -149,9 +152,38 @@ return sb.toString(); } else { if (completeDataDump) { - return new String(data, offset, len, "UTF-8"); + return escape(new String(data, offset, len, "UTF-8")); } return String.format("<DATA>:%d bytes", len-offset); } } + + private static Pattern controlCharPattern = Pattern.compile("\\p{Cntrl}"); + // \p{Cntrl} A control character: [\x00-\x1F\x7F] + private static String[] replacements = new String[33]; + static { + for (int i = 0; i < 32; i++) { + // no idea why need FOUR backslashes to get only one in printout + replacements[i] = String.format("\\\\%X", i); + } + replacements[32] = String.format("\\\\%X", 127); + } + // handy to get newline-separated data printed on newlines. + // set to false for non-printable data (e.g. binaries, where \n doesn't make sense) + private static boolean leaveNewlineInData = true; + + private static String escape(CharSequence possiblyWithBinary) { + Matcher m = controlCharPattern.matcher(possiblyWithBinary); + StringBuffer rv = new StringBuffer(); + while (m.find()) { + char c = m.group().charAt(0); + if (leaveNewlineInData && c == '\n') { + continue; + } + int x = (int) c; + m.appendReplacement(rv, replacements[x == 127 ? 32 : x]); + } + m.appendTail(rv); + return rv.toString(); + } }
--- a/src/org/tmatesoft/hg/internal/RevlogStreamWriter.java Wed Jan 23 19:14:15 2013 +0100 +++ b/src/org/tmatesoft/hg/internal/RevlogStreamWriter.java Wed Jan 30 15:48:36 2013 +0100 @@ -113,6 +113,23 @@ } } - public void addRevision(String text, int baseRevision, int linkRevision, int p1, int p2) { + + private final DigestHelper dh = new DigestHelper(); + + public void addRevision(byte[] content, int linkRevision, int p1, int p2) { + Nodeid p1Rev = parent(p1); + Nodeid p2Rev = parent(p2); + byte[] revisionBytes = dh.sha1(p1Rev, p2Rev, content).asBinary(); + //final Nodeid revision = Nodeid.fromBinary(revisionBytes, 0); + // cache last revision (its delta and baseRev) + PatchGenerator pg = new PatchGenerator(); + byte[] prev = null; + Patch patch = pg.delta(prev, content); + byte[] patchContent; + // rest as in HgCloneCommand + } + + private Nodeid parent(int parentIndex) { + return null; } }