tikhomirov@533: /* tikhomirov@533: * Copyright (c) 2013 TMate Software Ltd tikhomirov@533: * tikhomirov@533: * This program is free software; you can redistribute it and/or modify tikhomirov@533: * it under the terms of the GNU General Public License as published by tikhomirov@533: * the Free Software Foundation; version 2 of the License. tikhomirov@533: * tikhomirov@533: * This program is distributed in the hope that it will be useful, tikhomirov@533: * but WITHOUT ANY WARRANTY; without even the implied warranty of tikhomirov@533: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the tikhomirov@533: * GNU General Public License for more details. tikhomirov@533: * tikhomirov@533: * For information on how to redistribute this software under tikhomirov@533: * the terms of a license other than GNU General Public License tikhomirov@533: * contact TMate Software at support@hg4j.com tikhomirov@533: */ tikhomirov@533: package org.tmatesoft.hg.internal; tikhomirov@533: tikhomirov@533: import java.util.ArrayList; tikhomirov@533: import java.util.HashMap; tikhomirov@533: import java.util.Map; tikhomirov@533: tikhomirov@549: import org.tmatesoft.hg.repo.HgInvalidStateException; tikhomirov@533: tikhomirov@533: /** tikhomirov@538: * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output): tikhomirov@533: * tikhomirov@533: * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 tikhomirov@533: * : tikhomirov@533: * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 tikhomirov@533: * tikhomirov@533: * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded tikhomirov@533: * in the patch. tikhomirov@533: * tikhomirov@533: * Mercurial paper describes reasons for choosing this approach to delta generation, too. tikhomirov@533: * tikhomirov@533: * tikhomirov@533: * @author Artem Tikhomirov tikhomirov@533: * @author TMate Software Ltd. tikhomirov@533: */ tikhomirov@551: public class DiffHelper> { tikhomirov@533: tikhomirov@551: private Map chunk2UseIndex; tikhomirov@544: private T seq1, seq2; tikhomirov@533: tikhomirov@533: // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively tikhomirov@533: private int matchStartS1, matchStartS2; tikhomirov@541: tikhomirov@544: private MatchInspector matchInspector; tikhomirov@533: tikhomirov@544: public void init(T s1, T s2) { tikhomirov@544: seq1 = s1; tikhomirov@544: seq2 = s2; tikhomirov@544: prepare(s2); tikhomirov@533: } tikhomirov@549: tikhomirov@549: public void init(T s1) { tikhomirov@549: if (seq2 == null) { tikhomirov@549: throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin"); tikhomirov@549: } tikhomirov@549: seq1 = s1; tikhomirov@549: } tikhomirov@533: tikhomirov@544: tikhomirov@544: private void prepare(T s2) { tikhomirov@551: chunk2UseIndex = new HashMap(); tikhomirov@533: for (int i = 0, len = s2.chunkCount(); i < len; i++) { tikhomirov@551: Object bc = s2.chunk(i); tikhomirov@533: IntVector loc = chunk2UseIndex.get(bc); tikhomirov@533: if (loc == null) { tikhomirov@533: chunk2UseIndex.put(bc, loc = new IntVector()); tikhomirov@533: } tikhomirov@533: loc.add(i); tikhomirov@533: // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect tikhomirov@533: // in this case need to find the only ByteChain to keep indexes tikhomirov@533: // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) tikhomirov@533: // or kept within only one of them tikhomirov@533: } tikhomirov@533: } tikhomirov@533: tikhomirov@544: public void findMatchingBlocks(MatchInspector insp) { tikhomirov@541: insp.begin(seq1, seq2); tikhomirov@541: matchInspector = insp; tikhomirov@533: findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); tikhomirov@541: insp.end(); tikhomirov@533: } tikhomirov@533: tikhomirov@533: /** tikhomirov@533: * implementation based on Python's difflib.py and SequenceMatcher tikhomirov@533: */ tikhomirov@533: public int longestMatch(int startS1, int endS1, int startS2, int endS2) { tikhomirov@533: matchStartS1 = matchStartS2 = 0; tikhomirov@533: int maxLength = 0; tikhomirov@533: IntMap chunkIndex2MatchCount = new IntMap(8); tikhomirov@533: for (int i = startS1; i < endS1; i++) { tikhomirov@551: Object bc = seq1.chunk(i); tikhomirov@533: IntVector occurencesInS2 = chunk2UseIndex.get(bc); tikhomirov@533: if (occurencesInS2 == null) { tikhomirov@551: chunkIndex2MatchCount.clear(); tikhomirov@533: continue; tikhomirov@533: } tikhomirov@551: IntMap newChunkIndex2MatchCount = new IntMap(8); tikhomirov@533: for (int j : occurencesInS2.toArray()) { tikhomirov@533: // s1[i] == s2[j] tikhomirov@533: if (j < startS2) { tikhomirov@533: continue; tikhomirov@533: } tikhomirov@533: if (j >= endS2) { tikhomirov@533: break; tikhomirov@533: } tikhomirov@533: int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; tikhomirov@533: int k = prevChunkMatches + 1; tikhomirov@533: newChunkIndex2MatchCount.put(j, k); tikhomirov@533: if (k > maxLength) { tikhomirov@533: matchStartS1 = i-k+1; tikhomirov@533: matchStartS2 = j-k+1; tikhomirov@533: maxLength = k; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: chunkIndex2MatchCount = newChunkIndex2MatchCount; tikhomirov@533: } tikhomirov@533: return maxLength; tikhomirov@533: } tikhomirov@533: tikhomirov@541: private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { tikhomirov@533: int matchLength = longestMatch(startS1, endS1, startS2, endS2); tikhomirov@533: if (matchLength > 0) { tikhomirov@533: final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; tikhomirov@533: if (startS1 < matchStartS1 && startS2 < matchStartS2) { tikhomirov@533: findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); tikhomirov@533: } tikhomirov@541: matchInspector.match(saveStartS1, saveStartS2, matchLength); tikhomirov@533: if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { tikhomirov@533: findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); tikhomirov@533: } tikhomirov@533: } tikhomirov@533: } tikhomirov@533: tikhomirov@551: public interface MatchInspector> { tikhomirov@544: void begin(T s1, T s2); tikhomirov@541: void match(int startSeq1, int startSeq2, int matchLength); tikhomirov@541: void end(); tikhomirov@541: } tikhomirov@533: tikhomirov@544: static class MatchDumpInspector> implements MatchInspector { tikhomirov@545: private int matchCount; tikhomirov@545: tikhomirov@544: public void begin(T s1, T s2) { tikhomirov@545: matchCount = 0; tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void match(int startSeq1, int startSeq2, int matchLength) { tikhomirov@545: matchCount++; tikhomirov@545: System.out.printf("match #%d: from line #%d and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength); tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void end() { tikhomirov@545: if (matchCount == 0) { tikhomirov@545: System.out.println("NO MATCHES FOUND!"); tikhomirov@545: } tikhomirov@541: } tikhomirov@541: } tikhomirov@541: tikhomirov@551: /** tikhomirov@551: * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed". tikhomirov@551: */ tikhomirov@551: public static class DeltaInspector> implements MatchInspector { tikhomirov@541: protected int changeStartS1, changeStartS2; tikhomirov@544: protected T seq1, seq2; tikhomirov@541: tikhomirov@544: public void begin(T s1, T s2) { tikhomirov@541: seq1 = s1; tikhomirov@541: seq2 = s2; tikhomirov@541: changeStartS1 = changeStartS2 = 0; tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void match(int startSeq1, int startSeq2, int matchLength) { tikhomirov@542: reportDeltaElement(startSeq1, startSeq2, matchLength); tikhomirov@541: changeStartS1 = startSeq1 + matchLength; tikhomirov@541: changeStartS2 = startSeq2 + matchLength; tikhomirov@541: } tikhomirov@541: tikhomirov@541: public void end() { tikhomirov@545: if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) { tikhomirov@545: reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0); tikhomirov@541: } tikhomirov@541: } tikhomirov@541: tikhomirov@542: protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) { tikhomirov@541: if (changeStartS1 < matchStartSeq1) { tikhomirov@541: if (changeStartS2 < matchStartSeq2) { tikhomirov@542: changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); tikhomirov@541: } else { tikhomirov@541: assert changeStartS2 == matchStartSeq2; tikhomirov@543: deleted(matchStartSeq2, changeStartS1, matchStartSeq1); tikhomirov@541: } tikhomirov@533: } else { tikhomirov@541: assert changeStartS1 == matchStartSeq1; tikhomirov@541: if(changeStartS2 < matchStartSeq2) { tikhomirov@545: added(changeStartS1, changeStartS2, matchStartSeq2); tikhomirov@541: } else { tikhomirov@541: assert changeStartS2 == matchStartSeq2; tikhomirov@549: if (matchStartSeq1 > 0 || matchStartSeq2 > 0) { tikhomirov@549: // FIXME perhaps, exception is too much for the case tikhomirov@549: // once diff is covered with tests, replace with assert false : msg; tikhomirov@549: throw new HgInvalidStateException(String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2)); tikhomirov@549: } tikhomirov@541: } tikhomirov@533: } tikhomirov@542: if (matchLength > 0) { tikhomirov@542: unchanged(matchStartSeq1, matchStartSeq2, matchLength); tikhomirov@542: } tikhomirov@542: } tikhomirov@542: tikhomirov@542: /** tikhomirov@542: * [s1From..s1To) replaced with [s2From..s2To) tikhomirov@542: */ tikhomirov@542: protected void changed(int s1From, int s1To, int s2From, int s2To) { tikhomirov@542: // NO-OP tikhomirov@542: } tikhomirov@542: tikhomirov@543: protected void deleted(int s2DeletePoint, int s1From, int s1To) { tikhomirov@542: // NO-OP tikhomirov@542: } tikhomirov@542: tikhomirov@542: protected void added(int s1InsertPoint, int s2From, int s2To) { tikhomirov@542: // NO-OP tikhomirov@542: } tikhomirov@542: tikhomirov@542: protected void unchanged(int s1From, int s2From, int length) { tikhomirov@542: // NO-OP tikhomirov@541: } tikhomirov@541: } tikhomirov@541: tikhomirov@583: public static class DeltaDumpInspector> extends DeltaInspector { tikhomirov@542: tikhomirov@542: @Override tikhomirov@542: protected void changed(int s1From, int s1To, int s2From, int s2To) { tikhomirov@542: System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To); tikhomirov@542: } tikhomirov@542: tikhomirov@542: @Override tikhomirov@543: protected void deleted(int s2DeletionPoint, int s1From, int s1To) { tikhomirov@542: System.out.printf("deleted [%d..%d)\n", s1From, s1To); tikhomirov@542: } tikhomirov@542: tikhomirov@542: @Override tikhomirov@542: protected void added(int s1InsertPoint, int s2From, int s2To) { tikhomirov@542: System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint); tikhomirov@542: } tikhomirov@545: tikhomirov@545: @Override tikhomirov@545: protected void unchanged(int s1From, int s2From, int length) { tikhomirov@545: System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length); tikhomirov@545: } tikhomirov@542: } tikhomirov@542: tikhomirov@544: /** tikhomirov@544: * Generic sequence of chunk, where chunk is anything comparable to another chunk, e.g. a string or a single char tikhomirov@544: * Sequence diff algorithm above doesn't care about sequence nature. tikhomirov@544: */ tikhomirov@551: public interface ChunkSequence { tikhomirov@544: public T chunk(int index); tikhomirov@544: public int chunkCount(); tikhomirov@533: } tikhomirov@533: tikhomirov@551: public static final class LineSequence implements ChunkSequence { tikhomirov@533: tikhomirov@533: private final byte[] input; tikhomirov@533: private ArrayList lines; tikhomirov@533: tikhomirov@544: public LineSequence(byte[] data) { tikhomirov@533: input = data; tikhomirov@533: } tikhomirov@533: tikhomirov@544: public static LineSequence newlines(byte[] array) { tikhomirov@544: return new LineSequence(array).splitByNewlines(); tikhomirov@544: } tikhomirov@544: tikhomirov@545: // sequence ends with fake, empty line chunk tikhomirov@544: public LineSequence splitByNewlines() { tikhomirov@533: lines = new ArrayList(); tikhomirov@533: int lastStart = 0; tikhomirov@533: for (int i = 0; i < input.length; i++) { tikhomirov@533: if (input[i] == '\n') { tikhomirov@533: lines.add(new ByteChain(lastStart, i+1)); tikhomirov@533: lastStart = i+1; tikhomirov@533: } else if (input[i] == '\r') { tikhomirov@533: if (i+1 < input.length && input[i+1] == '\n') { tikhomirov@533: i++; tikhomirov@533: } tikhomirov@533: lines.add(new ByteChain(lastStart, i+1)); tikhomirov@533: lastStart = i+1; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: if (lastStart < input.length) { tikhomirov@533: lines.add(new ByteChain(lastStart, input.length)); tikhomirov@533: } tikhomirov@538: // empty chunk to keep offset of input end tikhomirov@545: lines.add(new ByteChain(input.length)); tikhomirov@544: return this; tikhomirov@533: } tikhomirov@533: tikhomirov@533: public ByteChain chunk(int index) { tikhomirov@533: return lines.get(index); tikhomirov@533: } tikhomirov@533: tikhomirov@533: public int chunkCount() { tikhomirov@533: return lines.size(); tikhomirov@533: } tikhomirov@533: tikhomirov@533: public byte[] data(int chunkFrom, int chunkTo) { tikhomirov@533: if (chunkFrom == chunkTo) { tikhomirov@533: return new byte[0]; tikhomirov@533: } tikhomirov@533: int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); tikhomirov@533: byte[] rv = new byte[to - from]; tikhomirov@533: System.arraycopy(input, from, rv, 0, rv.length); tikhomirov@533: return rv; tikhomirov@533: } tikhomirov@533: tikhomirov@533: tikhomirov@556: public final class ByteChain { tikhomirov@533: private final int start, end; tikhomirov@533: private final int hash; tikhomirov@533: tikhomirov@545: /** tikhomirov@545: * construct a chunk with a sole purpose to keep tikhomirov@545: * offset of the data end tikhomirov@545: */ tikhomirov@545: ByteChain(int offset) { tikhomirov@545: start = end = offset; tikhomirov@545: // ensure this chunk doesn't match trailing chunk of another sequence tikhomirov@545: hash = System.identityHashCode(this); tikhomirov@545: } tikhomirov@545: tikhomirov@533: ByteChain(int s, int e) { tikhomirov@533: start = s; tikhomirov@533: end = e; tikhomirov@533: hash = calcHash(input, s, e); tikhomirov@533: } tikhomirov@533: tikhomirov@533: /** tikhomirov@533: * byte offset of the this ByteChain inside ChainSequence tikhomirov@533: */ tikhomirov@533: public int getOffset() { tikhomirov@533: return start; tikhomirov@533: } tikhomirov@533: tikhomirov@533: public byte[] data() { tikhomirov@533: byte[] rv = new byte[end - start]; tikhomirov@533: System.arraycopy(input, start, rv, 0, rv.length); tikhomirov@533: return rv; tikhomirov@533: } tikhomirov@533: tikhomirov@533: @Override tikhomirov@533: public boolean equals(Object obj) { tikhomirov@533: if (obj == null || obj.getClass() != ByteChain.class) { tikhomirov@533: return false; tikhomirov@533: } tikhomirov@533: ByteChain other = (ByteChain) obj; tikhomirov@533: if (other.hash != hash || other.end - other.start != end - start) { tikhomirov@533: return false; tikhomirov@533: } tikhomirov@533: return other.match(input, start); tikhomirov@533: } tikhomirov@533: tikhomirov@533: private boolean match(byte[] oi, int from) { tikhomirov@533: for (int i = start, j = from; i < end; i++, j++) { tikhomirov@544: if (LineSequence.this.input[i] != oi[j]) { tikhomirov@533: return false; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: return true; tikhomirov@533: } tikhomirov@533: tikhomirov@533: @Override tikhomirov@533: public int hashCode() { tikhomirov@533: return hash; tikhomirov@533: } tikhomirov@533: tikhomirov@533: @Override tikhomirov@533: public String toString() { tikhomirov@533: return String.format("[@%d\"%s\"]", start, new String(data())); tikhomirov@533: } tikhomirov@533: } tikhomirov@533: tikhomirov@533: // same as Arrays.hashCode(byte[]), just for a slice of a bigger array tikhomirov@533: static int calcHash(byte[] data, int from, int to) { tikhomirov@533: int result = 1; tikhomirov@533: for (int i = from; i < to; i++) { tikhomirov@533: result = 31 * result + data[i]; tikhomirov@533: } tikhomirov@533: return result; tikhomirov@533: } tikhomirov@533: } tikhomirov@533: }