comparison src/org/tmatesoft/hg/internal/DiffHelper.java @ 551:4ea0351ca878

Better (precise) name for diff facility, tests
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Wed, 20 Feb 2013 18:19:52 +0100
parents src/org/tmatesoft/hg/internal/PatchGenerator.java@83afa680555d
children e55f17a7a195
comparison
equal deleted inserted replaced
550:c1478cc31f45 551:4ea0351ca878
1 /*
2 * Copyright (c) 2013 TMate Software Ltd
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; version 2 of the License.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * For information on how to redistribute this software under
14 * the terms of a license other than GNU General Public License
15 * contact TMate Software at support@hg4j.com
16 */
17 package org.tmatesoft.hg.internal;
18
19 import java.util.ArrayList;
20 import java.util.HashMap;
21 import java.util.Map;
22
23 import org.tmatesoft.hg.repo.HgInvalidStateException;
24
25 /**
26 * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output):
27 *
28 * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6
29 * <PATCH>:
30 * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8
31 *
32 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded
33 * in the patch.
34 *
35 * Mercurial paper describes reasons for choosing this approach to delta generation, too.
36 *
37 *
38 * @author Artem Tikhomirov
39 * @author TMate Software Ltd.
40 */
41 public class DiffHelper<T extends DiffHelper.ChunkSequence<?>> {
42
43 private Map<Object, IntVector> chunk2UseIndex;
44 private T seq1, seq2;
45
46 // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively
47 private int matchStartS1, matchStartS2;
48
49 private MatchInspector<T> matchInspector;
50
51 public void init(T s1, T s2) {
52 seq1 = s1;
53 seq2 = s2;
54 prepare(s2);
55 }
56
57 public void init(T s1) {
58 if (seq2 == null) {
59 throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin");
60 }
61 seq1 = s1;
62 }
63
64
65 private void prepare(T s2) {
66 chunk2UseIndex = new HashMap<Object, IntVector>();
67 for (int i = 0, len = s2.chunkCount(); i < len; i++) {
68 Object bc = s2.chunk(i);
69 IntVector loc = chunk2UseIndex.get(bc);
70 if (loc == null) {
71 chunk2UseIndex.put(bc, loc = new IntVector());
72 }
73 loc.add(i);
74 // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect
75 // in this case need to find the only ByteChain to keep indexes
76 // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector)
77 // or kept within only one of them
78 }
79 }
80
81 public void findMatchingBlocks(MatchInspector<T> insp) {
82 insp.begin(seq1, seq2);
83 matchInspector = insp;
84 findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount());
85 insp.end();
86 }
87
88 /**
89 * implementation based on Python's difflib.py and SequenceMatcher
90 */
91 public int longestMatch(int startS1, int endS1, int startS2, int endS2) {
92 matchStartS1 = matchStartS2 = 0;
93 int maxLength = 0;
94 IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8);
95 for (int i = startS1; i < endS1; i++) {
96 Object bc = seq1.chunk(i);
97 IntVector occurencesInS2 = chunk2UseIndex.get(bc);
98 if (occurencesInS2 == null) {
99 chunkIndex2MatchCount.clear();
100 continue;
101 }
102 IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8);
103 for (int j : occurencesInS2.toArray()) {
104 // s1[i] == s2[j]
105 if (j < startS2) {
106 continue;
107 }
108 if (j >= endS2) {
109 break;
110 }
111 int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0;
112 int k = prevChunkMatches + 1;
113 newChunkIndex2MatchCount.put(j, k);
114 if (k > maxLength) {
115 matchStartS1 = i-k+1;
116 matchStartS2 = j-k+1;
117 maxLength = k;
118 }
119 }
120 chunkIndex2MatchCount = newChunkIndex2MatchCount;
121 }
122 return maxLength;
123 }
124
125 private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) {
126 int matchLength = longestMatch(startS1, endS1, startS2, endS2);
127 if (matchLength > 0) {
128 final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2;
129 if (startS1 < matchStartS1 && startS2 < matchStartS2) {
130 findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2);
131 }
132 matchInspector.match(saveStartS1, saveStartS2, matchLength);
133 if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) {
134 findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2);
135 }
136 }
137 }
138
139 public interface MatchInspector<T extends ChunkSequence<?>> {
140 void begin(T s1, T s2);
141 void match(int startSeq1, int startSeq2, int matchLength);
142 void end();
143 }
144
145 static class MatchDumpInspector<T extends ChunkSequence<?>> implements MatchInspector<T> {
146 private int matchCount;
147
148 public void begin(T s1, T s2) {
149 matchCount = 0;
150 }
151
152 public void match(int startSeq1, int startSeq2, int matchLength) {
153 matchCount++;
154 System.out.printf("match #%d: from line #%d and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength);
155 }
156
157 public void end() {
158 if (matchCount == 0) {
159 System.out.println("NO MATCHES FOUND!");
160 }
161 }
162 }
163
164 /**
165 * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed".
166 */
167 public static class DeltaInspector<T extends ChunkSequence<?>> implements MatchInspector<T> {
168 protected int changeStartS1, changeStartS2;
169 protected T seq1, seq2;
170
171 public void begin(T s1, T s2) {
172 seq1 = s1;
173 seq2 = s2;
174 changeStartS1 = changeStartS2 = 0;
175 }
176
177 public void match(int startSeq1, int startSeq2, int matchLength) {
178 reportDeltaElement(startSeq1, startSeq2, matchLength);
179 changeStartS1 = startSeq1 + matchLength;
180 changeStartS2 = startSeq2 + matchLength;
181 }
182
183 public void end() {
184 if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) {
185 reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0);
186 }
187 }
188
189 protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) {
190 if (changeStartS1 < matchStartSeq1) {
191 if (changeStartS2 < matchStartSeq2) {
192 changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2);
193 } else {
194 assert changeStartS2 == matchStartSeq2;
195 deleted(matchStartSeq2, changeStartS1, matchStartSeq1);
196 }
197 } else {
198 assert changeStartS1 == matchStartSeq1;
199 if(changeStartS2 < matchStartSeq2) {
200 added(changeStartS1, changeStartS2, matchStartSeq2);
201 } else {
202 assert changeStartS2 == matchStartSeq2;
203 if (matchStartSeq1 > 0 || matchStartSeq2 > 0) {
204 // FIXME perhaps, exception is too much for the case
205 // once diff is covered with tests, replace with assert false : msg;
206 throw new HgInvalidStateException(String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2));
207 }
208 }
209 }
210 if (matchLength > 0) {
211 unchanged(matchStartSeq1, matchStartSeq2, matchLength);
212 }
213 }
214
215 /**
216 * [s1From..s1To) replaced with [s2From..s2To)
217 */
218 protected void changed(int s1From, int s1To, int s2From, int s2To) {
219 // NO-OP
220 }
221
222 protected void deleted(int s2DeletePoint, int s1From, int s1To) {
223 // NO-OP
224 }
225
226 protected void added(int s1InsertPoint, int s2From, int s2To) {
227 // NO-OP
228 }
229
230 protected void unchanged(int s1From, int s2From, int length) {
231 // NO-OP
232 }
233 }
234
235 static class DeltaDumpInspector<T extends ChunkSequence<?>> extends DeltaInspector<T> {
236
237 @Override
238 protected void changed(int s1From, int s1To, int s2From, int s2To) {
239 System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To);
240 }
241
242 @Override
243 protected void deleted(int s2DeletionPoint, int s1From, int s1To) {
244 System.out.printf("deleted [%d..%d)\n", s1From, s1To);
245 }
246
247 @Override
248 protected void added(int s1InsertPoint, int s2From, int s2To) {
249 System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint);
250 }
251
252 @Override
253 protected void unchanged(int s1From, int s2From, int length) {
254 System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length);
255 }
256 }
257
258 /**
259 * Generic sequence of chunk, where chunk is anything comparable to another chunk, e.g. a string or a single char
260 * Sequence diff algorithm above doesn't care about sequence nature.
261 */
262 public interface ChunkSequence<T> {
263 public T chunk(int index);
264 public int chunkCount();
265 }
266
267 public static final class LineSequence implements ChunkSequence<LineSequence.ByteChain> {
268
269 private final byte[] input;
270 private ArrayList<ByteChain> lines;
271
272 public LineSequence(byte[] data) {
273 input = data;
274 }
275
276 public static LineSequence newlines(byte[] array) {
277 return new LineSequence(array).splitByNewlines();
278 }
279
280 // sequence ends with fake, empty line chunk
281 public LineSequence splitByNewlines() {
282 lines = new ArrayList<ByteChain>();
283 int lastStart = 0;
284 for (int i = 0; i < input.length; i++) {
285 if (input[i] == '\n') {
286 lines.add(new ByteChain(lastStart, i+1));
287 lastStart = i+1;
288 } else if (input[i] == '\r') {
289 if (i+1 < input.length && input[i+1] == '\n') {
290 i++;
291 }
292 lines.add(new ByteChain(lastStart, i+1));
293 lastStart = i+1;
294 }
295 }
296 if (lastStart < input.length) {
297 lines.add(new ByteChain(lastStart, input.length));
298 }
299 // empty chunk to keep offset of input end
300 lines.add(new ByteChain(input.length));
301 return this;
302 }
303
304 public ByteChain chunk(int index) {
305 return lines.get(index);
306 }
307
308 public int chunkCount() {
309 return lines.size();
310 }
311
312 public byte[] data(int chunkFrom, int chunkTo) {
313 if (chunkFrom == chunkTo) {
314 return new byte[0];
315 }
316 int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset();
317 byte[] rv = new byte[to - from];
318 System.arraycopy(input, from, rv, 0, rv.length);
319 return rv;
320 }
321
322
323 final class ByteChain {
324 private final int start, end;
325 private final int hash;
326
327 /**
328 * construct a chunk with a sole purpose to keep
329 * offset of the data end
330 */
331 ByteChain(int offset) {
332 start = end = offset;
333 // ensure this chunk doesn't match trailing chunk of another sequence
334 hash = System.identityHashCode(this);
335 }
336
337 ByteChain(int s, int e) {
338 start = s;
339 end = e;
340 hash = calcHash(input, s, e);
341 }
342
343 /**
344 * byte offset of the this ByteChain inside ChainSequence
345 */
346 public int getOffset() {
347 return start;
348 }
349
350 public byte[] data() {
351 byte[] rv = new byte[end - start];
352 System.arraycopy(input, start, rv, 0, rv.length);
353 return rv;
354 }
355
356 @Override
357 public boolean equals(Object obj) {
358 if (obj == null || obj.getClass() != ByteChain.class) {
359 return false;
360 }
361 ByteChain other = (ByteChain) obj;
362 if (other.hash != hash || other.end - other.start != end - start) {
363 return false;
364 }
365 return other.match(input, start);
366 }
367
368 private boolean match(byte[] oi, int from) {
369 for (int i = start, j = from; i < end; i++, j++) {
370 if (LineSequence.this.input[i] != oi[j]) {
371 return false;
372 }
373 }
374 return true;
375 }
376
377 @Override
378 public int hashCode() {
379 return hash;
380 }
381
382 @Override
383 public String toString() {
384 return String.format("[@%d\"%s\"]", start, new String(data()));
385 }
386 }
387
388 // same as Arrays.hashCode(byte[]), just for a slice of a bigger array
389 static int calcHash(byte[] data, int from, int to) {
390 int result = 1;
391 for (int i = from; i < to; i++) {
392 result = 31 * result + data[i];
393 }
394 return result;
395 }
396 }
397 }