Mercurial > hg4j
comparison src/org/tmatesoft/hg/internal/diff/DiffHelper.java @ 703:7839ff0bfd78
Refactor: move diff/blame related code to a separate package
author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
---|---|
date | Wed, 14 Aug 2013 14:51:51 +0200 |
parents | src/org/tmatesoft/hg/internal/DiffHelper.java@58a6900f845d |
children |
comparison
equal
deleted
inserted
replaced
702:992fa84e7885 | 703:7839ff0bfd78 |
---|---|
1 /* | |
2 * Copyright (c) 2013 TMate Software Ltd | |
3 * | |
4 * This program is free software; you can redistribute it and/or modify | |
5 * it under the terms of the GNU General Public License as published by | |
6 * the Free Software Foundation; version 2 of the License. | |
7 * | |
8 * This program is distributed in the hope that it will be useful, | |
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
11 * GNU General Public License for more details. | |
12 * | |
13 * For information on how to redistribute this software under | |
14 * the terms of a license other than GNU General Public License | |
15 * contact TMate Software at support@hg4j.com | |
16 */ | |
17 package org.tmatesoft.hg.internal.diff; | |
18 | |
19 import java.util.ArrayList; | |
20 import java.util.HashMap; | |
21 import java.util.Map; | |
22 | |
23 import org.tmatesoft.hg.internal.IntMap; | |
24 import org.tmatesoft.hg.internal.IntSliceSeq; | |
25 import org.tmatesoft.hg.internal.IntTuple; | |
26 import org.tmatesoft.hg.internal.IntVector; | |
27 | |
28 /** | |
29 * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output): | |
30 * | |
31 * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 | |
32 * <PATCH>: | |
33 * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 | |
34 * | |
35 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded | |
36 * in the patch. | |
37 * | |
38 * Mercurial paper describes reasons for choosing this approach to delta generation, too. | |
39 * | |
40 * | |
41 * @author Artem Tikhomirov | |
42 * @author TMate Software Ltd. | |
43 */ | |
44 public class DiffHelper<T extends DiffHelper.ChunkSequence<?>> { | |
45 | |
46 private Map<Object, IntVector> chunk2UseIndex; | |
47 private T seq1, seq2; | |
48 | |
49 // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively | |
50 private int matchStartS1, matchStartS2; | |
51 | |
52 private MatchInspector<T> matchInspector; | |
53 | |
54 public void init(T s1, T s2) { | |
55 seq1 = s1; | |
56 seq2 = s2; | |
57 prepare(s2); | |
58 } | |
59 | |
60 public void init(T s1) { | |
61 if (seq2 == null) { | |
62 throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin"); | |
63 } | |
64 seq1 = s1; | |
65 } | |
66 | |
67 | |
68 private void prepare(T s2) { | |
69 chunk2UseIndex = new HashMap<Object, IntVector>(); | |
70 for (int i = 0, len = s2.chunkCount(); i < len; i++) { | |
71 Object bc = s2.chunk(i); | |
72 IntVector loc = chunk2UseIndex.get(bc); | |
73 if (loc == null) { | |
74 chunk2UseIndex.put(bc, loc = new IntVector()); | |
75 } | |
76 loc.add(i); | |
77 // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect | |
78 // in this case need to find the only ByteChain to keep indexes | |
79 // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) | |
80 // or kept within only one of them | |
81 } | |
82 } | |
83 | |
84 public void findMatchingBlocks(MatchInspector<T> insp) { | |
85 insp.begin(seq1, seq2); | |
86 matchInspector = insp; | |
87 findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); | |
88 insp.end(); | |
89 } | |
90 | |
91 /** | |
92 * look up every line in s2 that match lines in s1 | |
93 * idea: pure additions in s2 are diff-ed against s1 again and again, to see if there are any matches | |
94 */ | |
95 void findAllMatchAlternatives(final MatchInspector<T> insp) { | |
96 assert seq1.chunkCount() > 0; | |
97 final IntSliceSeq insertions = new IntSliceSeq(2); | |
98 final boolean matchedAny[] = new boolean[] {false}; | |
99 DeltaInspector<T> myInsp = new DeltaInspector<T>() { | |
100 @Override | |
101 protected void unchanged(int s1From, int s2From, int length) { | |
102 matchedAny[0] = true; | |
103 insp.match(s1From, s2From, length); | |
104 } | |
105 @Override | |
106 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
107 insertions.add(s2From, s2To); | |
108 } | |
109 }; | |
110 matchInspector = myInsp; | |
111 myInsp.begin(seq1, seq2); | |
112 IntSliceSeq s2RangesToCheck = new IntSliceSeq(2, 1, 0); | |
113 s2RangesToCheck.add(0, seq2.chunkCount()); | |
114 do { | |
115 IntSliceSeq nextCheck = new IntSliceSeq(2); | |
116 for (IntTuple t : s2RangesToCheck) { | |
117 int s2Start = t.at(0); | |
118 int s2End = t.at(1); | |
119 myInsp.changeStartS1 = 0; | |
120 myInsp.changeStartS2 = s2Start; | |
121 insp.begin(seq1, seq2); | |
122 matchedAny[0] = false; | |
123 findMatchingBlocks(0, seq1.chunkCount(), s2Start, s2End); | |
124 insp.end(); | |
125 myInsp.end(); | |
126 if (matchedAny[0]) { | |
127 nextCheck.addAll(insertions); | |
128 } | |
129 insertions.clear(); | |
130 } | |
131 s2RangesToCheck = nextCheck; | |
132 } while (s2RangesToCheck.size() > 0); | |
133 } | |
134 | |
135 /** | |
136 * implementation based on Python's difflib.py and SequenceMatcher | |
137 */ | |
138 public int longestMatch(int startS1, int endS1, int startS2, int endS2) { | |
139 matchStartS1 = matchStartS2 = 0; | |
140 int maxLength = 0; | |
141 IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8); | |
142 for (int i = startS1; i < endS1; i++) { | |
143 Object bc = seq1.chunk(i); | |
144 IntVector occurencesInS2 = chunk2UseIndex.get(bc); | |
145 if (occurencesInS2 == null) { | |
146 chunkIndex2MatchCount.clear(); | |
147 continue; | |
148 } | |
149 IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8); | |
150 for (int j : occurencesInS2.toArray()) { | |
151 // s1[i] == s2[j] | |
152 if (j < startS2) { | |
153 continue; | |
154 } | |
155 if (j >= endS2) { | |
156 break; | |
157 } | |
158 int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; | |
159 int k = prevChunkMatches + 1; | |
160 newChunkIndex2MatchCount.put(j, k); | |
161 if (k > maxLength) { | |
162 matchStartS1 = i-k+1; | |
163 matchStartS2 = j-k+1; | |
164 maxLength = k; | |
165 } | |
166 } | |
167 chunkIndex2MatchCount = newChunkIndex2MatchCount; | |
168 } | |
169 return maxLength; | |
170 } | |
171 | |
172 private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { | |
173 int matchLength = longestMatch(startS1, endS1, startS2, endS2); | |
174 if (matchLength > 0) { | |
175 final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; | |
176 if (startS1 < matchStartS1 && startS2 < matchStartS2) { | |
177 findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); | |
178 } | |
179 matchInspector.match(saveStartS1, saveStartS2, matchLength); | |
180 if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { | |
181 findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); | |
182 } | |
183 } | |
184 } | |
185 | |
186 public interface MatchInspector<T extends ChunkSequence<?>> { | |
187 void begin(T s1, T s2); | |
188 void match(int startSeq1, int startSeq2, int matchLength); | |
189 void end(); | |
190 } | |
191 | |
192 static class MatchDumpInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
193 private int matchCount; | |
194 | |
195 public void begin(T s1, T s2) { | |
196 matchCount = 0; | |
197 } | |
198 | |
199 public void match(int startSeq1, int startSeq2, int matchLength) { | |
200 matchCount++; | |
201 System.out.printf("match #%d: from line #%d and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength); | |
202 } | |
203 | |
204 public void end() { | |
205 if (matchCount == 0) { | |
206 System.out.println("NO MATCHES FOUND!"); | |
207 } | |
208 } | |
209 } | |
210 | |
211 /** | |
212 * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed". | |
213 */ | |
214 public static class DeltaInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
215 protected int changeStartS1, changeStartS2; | |
216 protected T seq1, seq2; | |
217 | |
218 public void begin(T s1, T s2) { | |
219 seq1 = s1; | |
220 seq2 = s2; | |
221 changeStartS1 = changeStartS2 = 0; | |
222 } | |
223 | |
224 public void match(int startSeq1, int startSeq2, int matchLength) { | |
225 reportDeltaElement(startSeq1, startSeq2, matchLength); | |
226 changeStartS1 = startSeq1 + matchLength; | |
227 changeStartS2 = startSeq2 + matchLength; | |
228 } | |
229 | |
230 public void end() { | |
231 if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) { | |
232 reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0); | |
233 } | |
234 } | |
235 | |
236 protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) { | |
237 if (changeStartS1 < matchStartSeq1) { | |
238 if (changeStartS2 < matchStartSeq2) { | |
239 changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); | |
240 } else { | |
241 assert changeStartS2 == matchStartSeq2; | |
242 deleted(matchStartSeq2, changeStartS1, matchStartSeq1); | |
243 } | |
244 } else { | |
245 assert changeStartS1 == matchStartSeq1; | |
246 if(changeStartS2 < matchStartSeq2) { | |
247 added(changeStartS1, changeStartS2, matchStartSeq2); | |
248 } else { | |
249 assert changeStartS2 == matchStartSeq2; | |
250 if (matchStartSeq1 > 0 || matchStartSeq2 > 0) { | |
251 assert false : String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); | |
252 } | |
253 } | |
254 } | |
255 if (matchLength > 0) { | |
256 unchanged(matchStartSeq1, matchStartSeq2, matchLength); | |
257 } | |
258 } | |
259 | |
260 /** | |
261 * [s1From..s1To) replaced with [s2From..s2To) | |
262 */ | |
263 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
264 // NO-OP | |
265 } | |
266 | |
267 protected void deleted(int s2DeletePoint, int s1From, int s1To) { | |
268 // NO-OP | |
269 } | |
270 | |
271 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
272 // NO-OP | |
273 } | |
274 | |
275 protected void unchanged(int s1From, int s2From, int length) { | |
276 // NO-OP | |
277 } | |
278 } | |
279 | |
280 public static class DeltaDumpInspector<T extends ChunkSequence<?>> extends DeltaInspector<T> { | |
281 | |
282 @Override | |
283 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
284 System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To); | |
285 } | |
286 | |
287 @Override | |
288 protected void deleted(int s2DeletionPoint, int s1From, int s1To) { | |
289 System.out.printf("deleted [%d..%d)\n", s1From, s1To); | |
290 } | |
291 | |
292 @Override | |
293 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
294 System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint); | |
295 } | |
296 | |
297 @Override | |
298 protected void unchanged(int s1From, int s2From, int length) { | |
299 System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length); | |
300 } | |
301 } | |
302 | |
303 /** | |
304 * Generic sequence of chunk, where chunk is anything comparable to another chunk, e.g. a string or a single char | |
305 * Sequence diff algorithm above doesn't care about sequence nature. | |
306 */ | |
307 public interface ChunkSequence<T> { | |
308 public T chunk(int index); | |
309 public int chunkCount(); | |
310 } | |
311 | |
312 public static final class LineSequence implements ChunkSequence<LineSequence.ByteChain> { | |
313 | |
314 private final byte[] input; | |
315 private ArrayList<ByteChain> lines; | |
316 | |
317 public LineSequence(byte[] data) { | |
318 input = data; | |
319 } | |
320 | |
321 public static LineSequence newlines(byte[] array) { | |
322 return new LineSequence(array).splitByNewlines(); | |
323 } | |
324 | |
325 // sequence ends with fake, empty line chunk | |
326 public LineSequence splitByNewlines() { | |
327 lines = new ArrayList<ByteChain>(); | |
328 int lastStart = 0; | |
329 for (int i = 0; i < input.length; i++) { | |
330 if (input[i] == '\n') { | |
331 lines.add(new ByteChain(lastStart, i+1)); | |
332 lastStart = i+1; | |
333 } else if (input[i] == '\r') { | |
334 if (i+1 < input.length && input[i+1] == '\n') { | |
335 i++; | |
336 } | |
337 lines.add(new ByteChain(lastStart, i+1)); | |
338 lastStart = i+1; | |
339 } | |
340 } | |
341 if (lastStart < input.length) { | |
342 lines.add(new ByteChain(lastStart, input.length)); | |
343 } | |
344 // empty chunk to keep offset of input end | |
345 lines.add(new ByteChain(input.length)); | |
346 return this; | |
347 } | |
348 | |
349 public ByteChain chunk(int index) { | |
350 return lines.get(index); | |
351 } | |
352 | |
353 public int chunkCount() { | |
354 return lines.size(); | |
355 } | |
356 | |
357 public byte[] data(int chunkFrom, int chunkTo) { | |
358 if (chunkFrom == chunkTo) { | |
359 return new byte[0]; | |
360 } | |
361 int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); | |
362 byte[] rv = new byte[to - from]; | |
363 System.arraycopy(input, from, rv, 0, rv.length); | |
364 return rv; | |
365 } | |
366 | |
367 | |
368 public final class ByteChain { | |
369 private final int start, end; | |
370 private final int hash; | |
371 | |
372 /** | |
373 * construct a chunk with a sole purpose to keep | |
374 * offset of the data end | |
375 */ | |
376 ByteChain(int offset) { | |
377 start = end = offset; | |
378 // ensure this chunk doesn't match trailing chunk of another sequence | |
379 hash = System.identityHashCode(this); | |
380 } | |
381 | |
382 ByteChain(int s, int e) { | |
383 start = s; | |
384 end = e; | |
385 hash = calcHash(input, s, e); | |
386 } | |
387 | |
388 /** | |
389 * byte offset of the this ByteChain inside ChainSequence | |
390 */ | |
391 public int getOffset() { | |
392 return start; | |
393 } | |
394 | |
395 public byte[] data() { | |
396 byte[] rv = new byte[end - start]; | |
397 System.arraycopy(input, start, rv, 0, rv.length); | |
398 return rv; | |
399 } | |
400 | |
401 @Override | |
402 public boolean equals(Object obj) { | |
403 if (obj == null || obj.getClass() != ByteChain.class) { | |
404 return false; | |
405 } | |
406 ByteChain other = (ByteChain) obj; | |
407 if (other.hash != hash || other.end - other.start != end - start) { | |
408 return false; | |
409 } | |
410 return other.match(input, start); | |
411 } | |
412 | |
413 private boolean match(byte[] oi, int from) { | |
414 for (int i = start, j = from; i < end; i++, j++) { | |
415 if (LineSequence.this.input[i] != oi[j]) { | |
416 return false; | |
417 } | |
418 } | |
419 return true; | |
420 } | |
421 | |
422 @Override | |
423 public int hashCode() { | |
424 return hash; | |
425 } | |
426 | |
427 @Override | |
428 public String toString() { | |
429 return String.format("[@%d\"%s\"]", start, new String(data())); | |
430 } | |
431 } | |
432 | |
433 // same as Arrays.hashCode(byte[]), just for a slice of a bigger array | |
434 static int calcHash(byte[] data, int from, int to) { | |
435 int result = 1; | |
436 for (int i = from; i < to; i++) { | |
437 result = 31 * result + data[i]; | |
438 } | |
439 return result; | |
440 } | |
441 } | |
442 } |