comparison src/org/tmatesoft/hg/internal/diff/DiffHelper.java @ 703:7839ff0bfd78

Refactor: move diff/blame related code to a separate package
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Wed, 14 Aug 2013 14:51:51 +0200
parents src/org/tmatesoft/hg/internal/DiffHelper.java@58a6900f845d
children
comparison
equal deleted inserted replaced
702:992fa84e7885 703:7839ff0bfd78
1 /*
2 * Copyright (c) 2013 TMate Software Ltd
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; version 2 of the License.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * For information on how to redistribute this software under
14 * the terms of a license other than GNU General Public License
15 * contact TMate Software at support@hg4j.com
16 */
17 package org.tmatesoft.hg.internal.diff;
18
19 import java.util.ArrayList;
20 import java.util.HashMap;
21 import java.util.Map;
22
23 import org.tmatesoft.hg.internal.IntMap;
24 import org.tmatesoft.hg.internal.IntSliceSeq;
25 import org.tmatesoft.hg.internal.IntTuple;
26 import org.tmatesoft.hg.internal.IntVector;
27
28 /**
29 * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output):
30 *
31 * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6
32 * <PATCH>:
33 * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8
34 *
35 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded
36 * in the patch.
37 *
38 * Mercurial paper describes reasons for choosing this approach to delta generation, too.
39 *
40 *
41 * @author Artem Tikhomirov
42 * @author TMate Software Ltd.
43 */
44 public class DiffHelper<T extends DiffHelper.ChunkSequence<?>> {
45
46 private Map<Object, IntVector> chunk2UseIndex;
47 private T seq1, seq2;
48
49 // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively
50 private int matchStartS1, matchStartS2;
51
52 private MatchInspector<T> matchInspector;
53
54 public void init(T s1, T s2) {
55 seq1 = s1;
56 seq2 = s2;
57 prepare(s2);
58 }
59
60 public void init(T s1) {
61 if (seq2 == null) {
62 throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin");
63 }
64 seq1 = s1;
65 }
66
67
68 private void prepare(T s2) {
69 chunk2UseIndex = new HashMap<Object, IntVector>();
70 for (int i = 0, len = s2.chunkCount(); i < len; i++) {
71 Object bc = s2.chunk(i);
72 IntVector loc = chunk2UseIndex.get(bc);
73 if (loc == null) {
74 chunk2UseIndex.put(bc, loc = new IntVector());
75 }
76 loc.add(i);
77 // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect
78 // in this case need to find the only ByteChain to keep indexes
79 // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector)
80 // or kept within only one of them
81 }
82 }
83
84 public void findMatchingBlocks(MatchInspector<T> insp) {
85 insp.begin(seq1, seq2);
86 matchInspector = insp;
87 findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount());
88 insp.end();
89 }
90
91 /**
92 * look up every line in s2 that match lines in s1
93 * idea: pure additions in s2 are diff-ed against s1 again and again, to see if there are any matches
94 */
95 void findAllMatchAlternatives(final MatchInspector<T> insp) {
96 assert seq1.chunkCount() > 0;
97 final IntSliceSeq insertions = new IntSliceSeq(2);
98 final boolean matchedAny[] = new boolean[] {false};
99 DeltaInspector<T> myInsp = new DeltaInspector<T>() {
100 @Override
101 protected void unchanged(int s1From, int s2From, int length) {
102 matchedAny[0] = true;
103 insp.match(s1From, s2From, length);
104 }
105 @Override
106 protected void added(int s1InsertPoint, int s2From, int s2To) {
107 insertions.add(s2From, s2To);
108 }
109 };
110 matchInspector = myInsp;
111 myInsp.begin(seq1, seq2);
112 IntSliceSeq s2RangesToCheck = new IntSliceSeq(2, 1, 0);
113 s2RangesToCheck.add(0, seq2.chunkCount());
114 do {
115 IntSliceSeq nextCheck = new IntSliceSeq(2);
116 for (IntTuple t : s2RangesToCheck) {
117 int s2Start = t.at(0);
118 int s2End = t.at(1);
119 myInsp.changeStartS1 = 0;
120 myInsp.changeStartS2 = s2Start;
121 insp.begin(seq1, seq2);
122 matchedAny[0] = false;
123 findMatchingBlocks(0, seq1.chunkCount(), s2Start, s2End);
124 insp.end();
125 myInsp.end();
126 if (matchedAny[0]) {
127 nextCheck.addAll(insertions);
128 }
129 insertions.clear();
130 }
131 s2RangesToCheck = nextCheck;
132 } while (s2RangesToCheck.size() > 0);
133 }
134
135 /**
136 * implementation based on Python's difflib.py and SequenceMatcher
137 */
138 public int longestMatch(int startS1, int endS1, int startS2, int endS2) {
139 matchStartS1 = matchStartS2 = 0;
140 int maxLength = 0;
141 IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8);
142 for (int i = startS1; i < endS1; i++) {
143 Object bc = seq1.chunk(i);
144 IntVector occurencesInS2 = chunk2UseIndex.get(bc);
145 if (occurencesInS2 == null) {
146 chunkIndex2MatchCount.clear();
147 continue;
148 }
149 IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8);
150 for (int j : occurencesInS2.toArray()) {
151 // s1[i] == s2[j]
152 if (j < startS2) {
153 continue;
154 }
155 if (j >= endS2) {
156 break;
157 }
158 int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0;
159 int k = prevChunkMatches + 1;
160 newChunkIndex2MatchCount.put(j, k);
161 if (k > maxLength) {
162 matchStartS1 = i-k+1;
163 matchStartS2 = j-k+1;
164 maxLength = k;
165 }
166 }
167 chunkIndex2MatchCount = newChunkIndex2MatchCount;
168 }
169 return maxLength;
170 }
171
172 private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) {
173 int matchLength = longestMatch(startS1, endS1, startS2, endS2);
174 if (matchLength > 0) {
175 final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2;
176 if (startS1 < matchStartS1 && startS2 < matchStartS2) {
177 findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2);
178 }
179 matchInspector.match(saveStartS1, saveStartS2, matchLength);
180 if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) {
181 findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2);
182 }
183 }
184 }
185
186 public interface MatchInspector<T extends ChunkSequence<?>> {
187 void begin(T s1, T s2);
188 void match(int startSeq1, int startSeq2, int matchLength);
189 void end();
190 }
191
192 static class MatchDumpInspector<T extends ChunkSequence<?>> implements MatchInspector<T> {
193 private int matchCount;
194
195 public void begin(T s1, T s2) {
196 matchCount = 0;
197 }
198
199 public void match(int startSeq1, int startSeq2, int matchLength) {
200 matchCount++;
201 System.out.printf("match #%d: from line #%d and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength);
202 }
203
204 public void end() {
205 if (matchCount == 0) {
206 System.out.println("NO MATCHES FOUND!");
207 }
208 }
209 }
210
211 /**
212 * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed".
213 */
214 public static class DeltaInspector<T extends ChunkSequence<?>> implements MatchInspector<T> {
215 protected int changeStartS1, changeStartS2;
216 protected T seq1, seq2;
217
218 public void begin(T s1, T s2) {
219 seq1 = s1;
220 seq2 = s2;
221 changeStartS1 = changeStartS2 = 0;
222 }
223
224 public void match(int startSeq1, int startSeq2, int matchLength) {
225 reportDeltaElement(startSeq1, startSeq2, matchLength);
226 changeStartS1 = startSeq1 + matchLength;
227 changeStartS2 = startSeq2 + matchLength;
228 }
229
230 public void end() {
231 if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) {
232 reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0);
233 }
234 }
235
236 protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) {
237 if (changeStartS1 < matchStartSeq1) {
238 if (changeStartS2 < matchStartSeq2) {
239 changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2);
240 } else {
241 assert changeStartS2 == matchStartSeq2;
242 deleted(matchStartSeq2, changeStartS1, matchStartSeq1);
243 }
244 } else {
245 assert changeStartS1 == matchStartSeq1;
246 if(changeStartS2 < matchStartSeq2) {
247 added(changeStartS1, changeStartS2, matchStartSeq2);
248 } else {
249 assert changeStartS2 == matchStartSeq2;
250 if (matchStartSeq1 > 0 || matchStartSeq2 > 0) {
251 assert false : String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2);
252 }
253 }
254 }
255 if (matchLength > 0) {
256 unchanged(matchStartSeq1, matchStartSeq2, matchLength);
257 }
258 }
259
260 /**
261 * [s1From..s1To) replaced with [s2From..s2To)
262 */
263 protected void changed(int s1From, int s1To, int s2From, int s2To) {
264 // NO-OP
265 }
266
267 protected void deleted(int s2DeletePoint, int s1From, int s1To) {
268 // NO-OP
269 }
270
271 protected void added(int s1InsertPoint, int s2From, int s2To) {
272 // NO-OP
273 }
274
275 protected void unchanged(int s1From, int s2From, int length) {
276 // NO-OP
277 }
278 }
279
280 public static class DeltaDumpInspector<T extends ChunkSequence<?>> extends DeltaInspector<T> {
281
282 @Override
283 protected void changed(int s1From, int s1To, int s2From, int s2To) {
284 System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To);
285 }
286
287 @Override
288 protected void deleted(int s2DeletionPoint, int s1From, int s1To) {
289 System.out.printf("deleted [%d..%d)\n", s1From, s1To);
290 }
291
292 @Override
293 protected void added(int s1InsertPoint, int s2From, int s2To) {
294 System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint);
295 }
296
297 @Override
298 protected void unchanged(int s1From, int s2From, int length) {
299 System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length);
300 }
301 }
302
/**
 * Generic sequence of chunks, where a chunk is anything that can be compared to another chunk,
 * e.g. a string or a single char. The diff algorithm above doesn't care about the nature of the sequence.
 */
public interface ChunkSequence<T> {
    /**
     * @param index position of the chunk in the sequence
     * @return chunk at the given position
     */
    T chunk(int index);

    /**
     * @return number of chunks in the sequence
     */
    int chunkCount();
}
311
312 public static final class LineSequence implements ChunkSequence<LineSequence.ByteChain> {
313
314 private final byte[] input;
315 private ArrayList<ByteChain> lines;
316
317 public LineSequence(byte[] data) {
318 input = data;
319 }
320
321 public static LineSequence newlines(byte[] array) {
322 return new LineSequence(array).splitByNewlines();
323 }
324
325 // sequence ends with fake, empty line chunk
326 public LineSequence splitByNewlines() {
327 lines = new ArrayList<ByteChain>();
328 int lastStart = 0;
329 for (int i = 0; i < input.length; i++) {
330 if (input[i] == '\n') {
331 lines.add(new ByteChain(lastStart, i+1));
332 lastStart = i+1;
333 } else if (input[i] == '\r') {
334 if (i+1 < input.length && input[i+1] == '\n') {
335 i++;
336 }
337 lines.add(new ByteChain(lastStart, i+1));
338 lastStart = i+1;
339 }
340 }
341 if (lastStart < input.length) {
342 lines.add(new ByteChain(lastStart, input.length));
343 }
344 // empty chunk to keep offset of input end
345 lines.add(new ByteChain(input.length));
346 return this;
347 }
348
349 public ByteChain chunk(int index) {
350 return lines.get(index);
351 }
352
353 public int chunkCount() {
354 return lines.size();
355 }
356
357 public byte[] data(int chunkFrom, int chunkTo) {
358 if (chunkFrom == chunkTo) {
359 return new byte[0];
360 }
361 int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset();
362 byte[] rv = new byte[to - from];
363 System.arraycopy(input, from, rv, 0, rv.length);
364 return rv;
365 }
366
367
368 public final class ByteChain {
369 private final int start, end;
370 private final int hash;
371
372 /**
373 * construct a chunk with a sole purpose to keep
374 * offset of the data end
375 */
376 ByteChain(int offset) {
377 start = end = offset;
378 // ensure this chunk doesn't match trailing chunk of another sequence
379 hash = System.identityHashCode(this);
380 }
381
382 ByteChain(int s, int e) {
383 start = s;
384 end = e;
385 hash = calcHash(input, s, e);
386 }
387
388 /**
389 * byte offset of the this ByteChain inside ChainSequence
390 */
391 public int getOffset() {
392 return start;
393 }
394
395 public byte[] data() {
396 byte[] rv = new byte[end - start];
397 System.arraycopy(input, start, rv, 0, rv.length);
398 return rv;
399 }
400
401 @Override
402 public boolean equals(Object obj) {
403 if (obj == null || obj.getClass() != ByteChain.class) {
404 return false;
405 }
406 ByteChain other = (ByteChain) obj;
407 if (other.hash != hash || other.end - other.start != end - start) {
408 return false;
409 }
410 return other.match(input, start);
411 }
412
413 private boolean match(byte[] oi, int from) {
414 for (int i = start, j = from; i < end; i++, j++) {
415 if (LineSequence.this.input[i] != oi[j]) {
416 return false;
417 }
418 }
419 return true;
420 }
421
422 @Override
423 public int hashCode() {
424 return hash;
425 }
426
427 @Override
428 public String toString() {
429 return String.format("[@%d\"%s\"]", start, new String(data()));
430 }
431 }
432
433 // same as Arrays.hashCode(byte[]), just for a slice of a bigger array
434 static int calcHash(byte[] data, int from, int to) {
435 int result = 1;
436 for (int i = from; i < to; i++) {
437 result = 31 * result + data[i];
438 }
439 return result;
440 }
441 }
442 }