tikhomirov@533: /* tikhomirov@533: * Copyright (c) 2013 TMate Software Ltd tikhomirov@533: * tikhomirov@533: * This program is free software; you can redistribute it and/or modify tikhomirov@533: * it under the terms of the GNU General Public License as published by tikhomirov@533: * the Free Software Foundation; version 2 of the License. tikhomirov@533: * tikhomirov@533: * This program is distributed in the hope that it will be useful, tikhomirov@533: * but WITHOUT ANY WARRANTY; without even the implied warranty of tikhomirov@533: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the tikhomirov@533: * GNU General Public License for more details. tikhomirov@533: * tikhomirov@533: * For information on how to redistribute this software under tikhomirov@533: * the terms of a license other than GNU General Public License tikhomirov@533: * contact TMate Software at support@hg4j.com tikhomirov@533: */ tikhomirov@533: package org.tmatesoft.hg.internal; tikhomirov@533: tikhomirov@533: import java.util.ArrayList; tikhomirov@533: import java.util.HashMap; tikhomirov@533: import java.util.Map; tikhomirov@533: tikhomirov@533: import org.tmatesoft.hg.repo.HgDataFile; tikhomirov@533: import org.tmatesoft.hg.repo.HgLookup; tikhomirov@533: import org.tmatesoft.hg.repo.HgRepository; tikhomirov@533: tikhomirov@533: /** tikhomirov@538: * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output): tikhomirov@533: * tikhomirov@533: * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 tikhomirov@533: * : tikhomirov@533: * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 tikhomirov@533: * tikhomirov@533: * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded tikhomirov@533: * in the patch. tikhomirov@533: * tikhomirov@533: * Mercurial paper describes reasons for choosing this approach to delta generation, too. tikhomirov@533: * tikhomirov@533: * tikhomirov@533: * @author Artem Tikhomirov tikhomirov@533: * @author TMate Software Ltd. tikhomirov@533: */ tikhomirov@533: public class PatchGenerator { tikhomirov@533: tikhomirov@533: private Map chunk2UseIndex; tikhomirov@533: private ChunkSequence seq1, seq2; tikhomirov@533: tikhomirov@533: // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively tikhomirov@533: private int matchStartS1, matchStartS2; tikhomirov@541: tikhomirov@541: private MatchInspector matchInspector; tikhomirov@533: tikhomirov@533: public void init(byte[] data1, byte[] data2) { tikhomirov@533: seq1 = new ChunkSequence(data1); tikhomirov@533: seq1.splitByNewlines(); tikhomirov@533: seq2 = new ChunkSequence(data2); tikhomirov@533: seq2.splitByNewlines(); tikhomirov@533: prepare(seq2); tikhomirov@533: } tikhomirov@533: tikhomirov@533: private void prepare(ChunkSequence s2) { tikhomirov@533: chunk2UseIndex = new HashMap(); tikhomirov@533: for (int i = 0, len = s2.chunkCount(); i < len; i++) { tikhomirov@533: ChunkSequence.ByteChain bc = s2.chunk(i); tikhomirov@533: IntVector loc = chunk2UseIndex.get(bc); tikhomirov@533: if (loc == null) { tikhomirov@533: chunk2UseIndex.put(bc, loc = new IntVector()); tikhomirov@533: } tikhomirov@533: loc.add(i); tikhomirov@533: // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect tikhomirov@533: // in this case need to find the only ByteChain to keep indexes tikhomirov@533: // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) tikhomirov@533: // or kept within only one of them tikhomirov@533: } tikhomirov@533: // for (ChunkSequence.ByteChain bc : chunk2UseIndex.keySet()) { tikhomirov@533: // System.out.printf("%s: {", new String(bc.data())); tikhomirov@533: // for (int x : chunk2UseIndex.get(bc).toArray()) { tikhomirov@533: // System.out.printf(" %d,", x); tikhomirov@533: // } tikhomirov@533: // System.out.println("}"); tikhomirov@533: // } tikhomirov@533: } tikhomirov@533: tikhomirov@541: public void findMatchingBlocks(MatchInspector insp) { tikhomirov@541: insp.begin(seq1, seq2); tikhomirov@541: matchInspector = insp; tikhomirov@533: findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); tikhomirov@541: insp.end(); tikhomirov@533: } tikhomirov@533: tikhomirov@533: /** tikhomirov@533: * implementation based on Python's difflib.py and SequenceMatcher tikhomirov@533: */ tikhomirov@533: public int longestMatch(int startS1, int endS1, int startS2, int endS2) { tikhomirov@533: matchStartS1 = matchStartS2 = 0; tikhomirov@533: int maxLength = 0; tikhomirov@533: IntMap chunkIndex2MatchCount = new IntMap(8); tikhomirov@533: for (int i = startS1; i < endS1; i++) { tikhomirov@533: ChunkSequence.ByteChain bc = seq1.chunk(i); tikhomirov@533: IntMap newChunkIndex2MatchCount = new IntMap(8); tikhomirov@533: IntVector occurencesInS2 = chunk2UseIndex.get(bc); tikhomirov@533: if (occurencesInS2 == null) { tikhomirov@533: // chunkIndex2MatchCount.clear(); // TODO need clear instead of new instance tikhomirov@533: chunkIndex2MatchCount = newChunkIndex2MatchCount; tikhomirov@533: continue; tikhomirov@533: } tikhomirov@533: for (int j : occurencesInS2.toArray()) { tikhomirov@533: // s1[i] == s2[j] tikhomirov@533: if (j < startS2) { tikhomirov@533: continue; tikhomirov@533: } tikhomirov@533: if (j >= endS2) { tikhomirov@533: break; tikhomirov@533: } tikhomirov@533: int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; tikhomirov@533: int k = prevChunkMatches + 1; tikhomirov@533: newChunkIndex2MatchCount.put(j, k); tikhomirov@533: if (k > maxLength) { tikhomirov@533: matchStartS1 = i-k+1; tikhomirov@533: matchStartS2 = j-k+1; tikhomirov@533: maxLength = k; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: chunkIndex2MatchCount = newChunkIndex2MatchCount; tikhomirov@533: } tikhomirov@533: return maxLength; tikhomirov@533: } tikhomirov@533: tikhomirov@541: private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { tikhomirov@533: int matchLength = longestMatch(startS1, endS1, startS2, endS2); tikhomirov@533: if (matchLength > 0) { tikhomirov@533: final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; tikhomirov@533: if (startS1 < matchStartS1 && startS2 < matchStartS2) { tikhomirov@533: findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); tikhomirov@533: } tikhomirov@541: matchInspector.match(saveStartS1, saveStartS2, matchLength); tikhomirov@533: if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { tikhomirov@533: findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); tikhomirov@533: } tikhomirov@533: } tikhomirov@533: } tikhomirov@533: tikhomirov@541: interface MatchInspector { tikhomirov@541: void begin(ChunkSequence s1, ChunkSequence s2); tikhomirov@541: void match(int startSeq1, int startSeq2, int matchLength); tikhomirov@541: void end(); tikhomirov@541: } tikhomirov@533: tikhomirov@541: static class MatchDumpInspector implements MatchInspector { tikhomirov@541: public void begin(ChunkSequence s1, ChunkSequence s2) { tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void match(int startSeq1, int startSeq2, int matchLength) { tikhomirov@541: System.out.printf("match: from line #%d and line #%d of length %d\n", startSeq1, startSeq2, matchLength); tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void end() { tikhomirov@541: } tikhomirov@541: } tikhomirov@541: tikhomirov@542: static class DeltaInspector implements MatchInspector { tikhomirov@541: protected int changeStartS1, changeStartS2; tikhomirov@541: protected ChunkSequence seq1, seq2; tikhomirov@541: tikhomirov@541: tikhomirov@541: public void begin(ChunkSequence s1, ChunkSequence s2) { tikhomirov@541: seq1 = s1; tikhomirov@541: seq2 = s2; tikhomirov@541: changeStartS1 = changeStartS2 = 0; tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void match(int startSeq1, int startSeq2, int matchLength) { tikhomirov@542: reportDeltaElement(startSeq1, startSeq2, matchLength); tikhomirov@541: changeStartS1 = startSeq1 + matchLength; tikhomirov@541: changeStartS2 = startSeq2 + matchLength; tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void end() { tikhomirov@541: if (changeStartS1 < seq1.chunkCount() || changeStartS2 < seq2.chunkCount()) { tikhomirov@542: reportDeltaElement(seq1.chunkCount(), seq2.chunkCount(), 0); tikhomirov@541: } tikhomirov@541: } tikhomirov@541: tikhomirov@542: protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) { tikhomirov@541: if (changeStartS1 < matchStartSeq1) { tikhomirov@541: if (changeStartS2 < matchStartSeq2) { tikhomirov@542: changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); tikhomirov@541: } else { tikhomirov@541: assert changeStartS2 == matchStartSeq2; tikhomirov@542: deleted(changeStartS1, matchStartSeq1); tikhomirov@541: } tikhomirov@533: } else { tikhomirov@541: assert changeStartS1 == matchStartSeq1; tikhomirov@541: if(changeStartS2 < matchStartSeq2) { tikhomirov@542: added(matchStartSeq1, changeStartS2, matchStartSeq2); tikhomirov@541: } else { tikhomirov@541: assert changeStartS2 == matchStartSeq2; tikhomirov@541: System.out.printf("adjustent equal blocks %d, %d and %d,%d\n", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); tikhomirov@541: } tikhomirov@533: } tikhomirov@542: if (matchLength > 0) { tikhomirov@542: unchanged(matchStartSeq1, matchStartSeq2, matchLength); tikhomirov@542: } tikhomirov@542: } tikhomirov@542: tikhomirov@542: /** tikhomirov@542: * [s1From..s1To) replaced with [s2From..s2To) tikhomirov@542: */ tikhomirov@542: protected void changed(int s1From, int s1To, int s2From, int s2To) { tikhomirov@542: // NO-OP tikhomirov@542: } tikhomirov@542: tikhomirov@542: protected void deleted(int s1From, int s1To) { tikhomirov@542: // NO-OP tikhomirov@542: } tikhomirov@542: tikhomirov@542: protected void added(int s1InsertPoint, int s2From, int s2To) { tikhomirov@542: // NO-OP tikhomirov@542: } tikhomirov@542: tikhomirov@542: protected void unchanged(int s1From, int s2From, int length) { tikhomirov@542: // NO-OP tikhomirov@541: } tikhomirov@541: } tikhomirov@541: tikhomirov@542: static class DeltaDumpInspector extends DeltaInspector { tikhomirov@542: tikhomirov@542: @Override tikhomirov@542: protected void changed(int s1From, int s1To, int s2From, int s2To) { tikhomirov@542: System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To); tikhomirov@542: } tikhomirov@542: tikhomirov@542: @Override tikhomirov@542: protected void deleted(int s1From, int s1To) { tikhomirov@542: System.out.printf("deleted [%d..%d)\n", s1From, s1To); tikhomirov@542: } tikhomirov@542: tikhomirov@542: @Override tikhomirov@542: protected void added(int s1InsertPoint, int s2From, int s2To) { tikhomirov@542: System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint); tikhomirov@542: } tikhomirov@542: tikhomirov@542: } tikhomirov@542: tikhomirov@542: static class PatchFillInspector extends DeltaInspector { tikhomirov@541: private final Patch deltaCollector; tikhomirov@541: tikhomirov@541: PatchFillInspector(Patch p) { tikhomirov@541: assert p != null; tikhomirov@541: deltaCollector = p; tikhomirov@541: } tikhomirov@541: tikhomirov@541: @Override tikhomirov@542: protected void changed(int s1From, int s1To, int s2From, int s2To) { tikhomirov@542: int from = seq1.chunk(s1From).getOffset(); tikhomirov@542: int to = seq1.chunk(s1To).getOffset(); tikhomirov@542: byte[] data = seq2.data(s2From, s2To); tikhomirov@542: deltaCollector.add(from, to, data); tikhomirov@542: } tikhomirov@542: tikhomirov@542: @Override tikhomirov@542: protected void deleted(int s1From, int s1To) { tikhomirov@542: int from = seq1.chunk(s1From).getOffset(); tikhomirov@542: int to = seq1.chunk(s1To).getOffset(); tikhomirov@542: deltaCollector.add(from, to, new byte[0]); tikhomirov@542: } tikhomirov@542: tikhomirov@542: @Override tikhomirov@542: protected void added(int s1InsertPoint, int s2From, int s2To) { tikhomirov@542: int insPoint = seq1.chunk(s1InsertPoint).getOffset(); tikhomirov@542: byte[] data = seq2.data(s2From, s2To); tikhomirov@542: deltaCollector.add(insPoint, insPoint, data); tikhomirov@533: } tikhomirov@533: } tikhomirov@533: tikhomirov@541: tikhomirov@541: tikhomirov@533: public static void main(String[] args) throws Exception { tikhomirov@538: PatchGenerator pg1 = new PatchGenerator(); tikhomirov@538: pg1.init("hello".getBytes(), "hello\nworld".getBytes()); tikhomirov@541: pg1.findMatchingBlocks(new MatchDumpInspector()); tikhomirov@541: pg1.findMatchingBlocks(new DeltaDumpInspector()); tikhomirov@541: if (Boolean.FALSE.booleanValue()) { tikhomirov@538: return; tikhomirov@538: } tikhomirov@533: HgRepository repo = new HgLookup().detectFromWorkingDir(); tikhomirov@533: HgDataFile df = repo.getFileNode("cmdline/org/tmatesoft/hg/console/Main.java"); tikhomirov@533: ByteArrayChannel bac1, bac2; tikhomirov@533: df.content(80, bac1 = new ByteArrayChannel()); tikhomirov@533: df.content(81, bac2 = new ByteArrayChannel()); tikhomirov@533: // String s1 = "line 1\nline 2\r\nline 3\n\nline 1\nline 2"; tikhomirov@533: // String s2 = "abc\ncdef\r\nline 2\r\nline 3\nline 2"; tikhomirov@533: PatchGenerator pg = new PatchGenerator(); tikhomirov@533: pg.init(bac1.toArray(), bac2.toArray()); tikhomirov@541: System.out.println("Matches:"); tikhomirov@541: pg.findMatchingBlocks(new MatchDumpInspector()); tikhomirov@541: System.out.println("Deltas:"); tikhomirov@541: pg.findMatchingBlocks(new DeltaDumpInspector()); tikhomirov@533: } tikhomirov@533: tikhomirov@533: public Patch delta(byte[] prev, byte[] content) { tikhomirov@541: Patch rv = new Patch(); tikhomirov@533: init(prev, content); tikhomirov@541: findMatchingBlocks(new PatchFillInspector(rv)); tikhomirov@541: return rv; tikhomirov@533: } tikhomirov@533: tikhomirov@542: /* tikhomirov@542: * TODO shall be parameterized (template?) and refacctored to facilitate matching non lines only tikhomirov@542: * (sequence diff algorithm above doesn't care about sequence nature) tikhomirov@542: */ tikhomirov@542: static final class ChunkSequence { tikhomirov@533: tikhomirov@533: private final byte[] input; tikhomirov@533: private ArrayList lines; tikhomirov@533: tikhomirov@533: public ChunkSequence(byte[] data) { tikhomirov@533: input = data; tikhomirov@533: } tikhomirov@533: tikhomirov@533: public void splitByNewlines() { tikhomirov@533: lines = new ArrayList(); tikhomirov@533: int lastStart = 0; tikhomirov@533: for (int i = 0; i < input.length; i++) { tikhomirov@533: if (input[i] == '\n') { tikhomirov@533: lines.add(new ByteChain(lastStart, i+1)); tikhomirov@533: lastStart = i+1; tikhomirov@533: } else if (input[i] == '\r') { tikhomirov@533: if (i+1 < input.length && input[i+1] == '\n') { tikhomirov@533: i++; tikhomirov@533: } tikhomirov@533: lines.add(new ByteChain(lastStart, i+1)); tikhomirov@533: lastStart = i+1; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: if (lastStart < input.length) { tikhomirov@533: lines.add(new ByteChain(lastStart, input.length)); tikhomirov@533: } tikhomirov@538: // empty chunk to keep offset of input end tikhomirov@538: lines.add(new ByteChain(input.length, input.length)); tikhomirov@533: } tikhomirov@533: tikhomirov@533: public ByteChain chunk(int index) { tikhomirov@533: return lines.get(index); tikhomirov@533: } tikhomirov@533: tikhomirov@533: public int chunkCount() { tikhomirov@533: return lines.size(); tikhomirov@533: } tikhomirov@533: tikhomirov@533: public byte[] data(int chunkFrom, int chunkTo) { tikhomirov@533: if (chunkFrom == chunkTo) { tikhomirov@533: return new byte[0]; tikhomirov@533: } tikhomirov@533: int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); tikhomirov@533: byte[] rv = new byte[to - from]; tikhomirov@533: System.arraycopy(input, from, rv, 0, rv.length); tikhomirov@533: return rv; tikhomirov@533: } tikhomirov@533: tikhomirov@533: tikhomirov@533: final class ByteChain { tikhomirov@533: private final int start, end; tikhomirov@533: private final int hash; tikhomirov@533: tikhomirov@533: ByteChain(int s, int e) { tikhomirov@533: start = s; tikhomirov@533: end = e; tikhomirov@533: hash = calcHash(input, s, e); tikhomirov@533: } tikhomirov@533: tikhomirov@533: /** tikhomirov@533: * byte offset of the this ByteChain inside ChainSequence tikhomirov@533: */ tikhomirov@533: public int getOffset() { tikhomirov@533: return start; tikhomirov@533: } tikhomirov@533: tikhomirov@533: public byte[] data() { tikhomirov@533: byte[] rv = new byte[end - start]; tikhomirov@533: System.arraycopy(input, start, rv, 0, rv.length); tikhomirov@533: return rv; tikhomirov@533: } tikhomirov@533: tikhomirov@533: @Override tikhomirov@533: public boolean equals(Object obj) { tikhomirov@533: if (obj == null || obj.getClass() != ByteChain.class) { tikhomirov@533: return false; tikhomirov@533: } tikhomirov@533: ByteChain other = (ByteChain) obj; tikhomirov@533: if (other.hash != hash || other.end - other.start != end - start) { tikhomirov@533: return false; tikhomirov@533: } tikhomirov@533: return other.match(input, start); tikhomirov@533: } tikhomirov@533: tikhomirov@533: private boolean match(byte[] oi, int from) { tikhomirov@533: for (int i = start, j = from; i < end; i++, j++) { tikhomirov@533: if (ChunkSequence.this.input[i] != oi[j]) { tikhomirov@533: return false; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: return true; tikhomirov@533: } tikhomirov@533: tikhomirov@533: @Override tikhomirov@533: public int hashCode() { tikhomirov@533: return hash; tikhomirov@533: } tikhomirov@533: tikhomirov@533: @Override tikhomirov@533: public String toString() { tikhomirov@533: return String.format("[@%d\"%s\"]", start, new String(data())); tikhomirov@533: } tikhomirov@533: } tikhomirov@533: tikhomirov@533: // same as Arrays.hashCode(byte[]), just for a slice of a bigger array tikhomirov@533: static int calcHash(byte[] data, int from, int to) { tikhomirov@533: int result = 1; tikhomirov@533: for (int i = from; i < to; i++) { tikhomirov@533: result = 31 * result + data[i]; tikhomirov@533: } tikhomirov@533: return result; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: }