Mercurial > hg4j
comparison src/org/tmatesoft/hg/internal/DiffHelper.java @ 551:4ea0351ca878
Better (precise) name for diff facility, tests
author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
---|---|
date | Wed, 20 Feb 2013 18:19:52 +0100 |
parents | src/org/tmatesoft/hg/internal/PatchGenerator.java@83afa680555d |
children | e55f17a7a195 |
comparison
equal
deleted
inserted
replaced
550:c1478cc31f45 | 551:4ea0351ca878 |
---|---|
1 /* | |
2 * Copyright (c) 2013 TMate Software Ltd | |
3 * | |
4 * This program is free software; you can redistribute it and/or modify | |
5 * it under the terms of the GNU General Public License as published by | |
6 * the Free Software Foundation; version 2 of the License. | |
7 * | |
8 * This program is distributed in the hope that it will be useful, | |
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
11 * GNU General Public License for more details. | |
12 * | |
13 * For information on how to redistribute this software under | |
14 * the terms of a license other than GNU General Public License | |
15 * contact TMate Software at support@hg4j.com | |
16 */ | |
17 package org.tmatesoft.hg.internal; | |
18 | |
19 import java.util.ArrayList; | |
20 import java.util.HashMap; | |
21 import java.util.Map; | |
22 | |
23 import org.tmatesoft.hg.repo.HgInvalidStateException; | |
24 | |
25 /** | |
26 * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output): | |
27 * | |
28 * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 | |
29 * <PATCH>: | |
30 * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 | |
31 * | |
32 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded | |
33 * in the patch. | |
34 * | |
35 * Mercurial paper describes reasons for choosing this approach to delta generation, too. | |
36 * | |
37 * | |
38 * @author Artem Tikhomirov | |
39 * @author TMate Software Ltd. | |
40 */ | |
41 public class DiffHelper<T extends DiffHelper.ChunkSequence<?>> { | |
42 | |
43 private Map<Object, IntVector> chunk2UseIndex; | |
44 private T seq1, seq2; | |
45 | |
46 // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively | |
47 private int matchStartS1, matchStartS2; | |
48 | |
49 private MatchInspector<T> matchInspector; | |
50 | |
51 public void init(T s1, T s2) { | |
52 seq1 = s1; | |
53 seq2 = s2; | |
54 prepare(s2); | |
55 } | |
56 | |
57 public void init(T s1) { | |
58 if (seq2 == null) { | |
59 throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin"); | |
60 } | |
61 seq1 = s1; | |
62 } | |
63 | |
64 | |
65 private void prepare(T s2) { | |
66 chunk2UseIndex = new HashMap<Object, IntVector>(); | |
67 for (int i = 0, len = s2.chunkCount(); i < len; i++) { | |
68 Object bc = s2.chunk(i); | |
69 IntVector loc = chunk2UseIndex.get(bc); | |
70 if (loc == null) { | |
71 chunk2UseIndex.put(bc, loc = new IntVector()); | |
72 } | |
73 loc.add(i); | |
74 // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect | |
75 // in this case need to find the only ByteChain to keep indexes | |
76 // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) | |
77 // or kept within only one of them | |
78 } | |
79 } | |
80 | |
81 public void findMatchingBlocks(MatchInspector<T> insp) { | |
82 insp.begin(seq1, seq2); | |
83 matchInspector = insp; | |
84 findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); | |
85 insp.end(); | |
86 } | |
87 | |
88 /** | |
89 * implementation based on Python's difflib.py and SequenceMatcher | |
90 */ | |
91 public int longestMatch(int startS1, int endS1, int startS2, int endS2) { | |
92 matchStartS1 = matchStartS2 = 0; | |
93 int maxLength = 0; | |
94 IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8); | |
95 for (int i = startS1; i < endS1; i++) { | |
96 Object bc = seq1.chunk(i); | |
97 IntVector occurencesInS2 = chunk2UseIndex.get(bc); | |
98 if (occurencesInS2 == null) { | |
99 chunkIndex2MatchCount.clear(); | |
100 continue; | |
101 } | |
102 IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8); | |
103 for (int j : occurencesInS2.toArray()) { | |
104 // s1[i] == s2[j] | |
105 if (j < startS2) { | |
106 continue; | |
107 } | |
108 if (j >= endS2) { | |
109 break; | |
110 } | |
111 int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; | |
112 int k = prevChunkMatches + 1; | |
113 newChunkIndex2MatchCount.put(j, k); | |
114 if (k > maxLength) { | |
115 matchStartS1 = i-k+1; | |
116 matchStartS2 = j-k+1; | |
117 maxLength = k; | |
118 } | |
119 } | |
120 chunkIndex2MatchCount = newChunkIndex2MatchCount; | |
121 } | |
122 return maxLength; | |
123 } | |
124 | |
125 private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { | |
126 int matchLength = longestMatch(startS1, endS1, startS2, endS2); | |
127 if (matchLength > 0) { | |
128 final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; | |
129 if (startS1 < matchStartS1 && startS2 < matchStartS2) { | |
130 findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); | |
131 } | |
132 matchInspector.match(saveStartS1, saveStartS2, matchLength); | |
133 if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { | |
134 findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); | |
135 } | |
136 } | |
137 } | |
138 | |
139 public interface MatchInspector<T extends ChunkSequence<?>> { | |
140 void begin(T s1, T s2); | |
141 void match(int startSeq1, int startSeq2, int matchLength); | |
142 void end(); | |
143 } | |
144 | |
145 static class MatchDumpInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
146 private int matchCount; | |
147 | |
148 public void begin(T s1, T s2) { | |
149 matchCount = 0; | |
150 } | |
151 | |
152 public void match(int startSeq1, int startSeq2, int matchLength) { | |
153 matchCount++; | |
154 System.out.printf("match #%d: from line #%d and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength); | |
155 } | |
156 | |
157 public void end() { | |
158 if (matchCount == 0) { | |
159 System.out.println("NO MATCHES FOUND!"); | |
160 } | |
161 } | |
162 } | |
163 | |
164 /** | |
165 * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed". | |
166 */ | |
167 public static class DeltaInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
168 protected int changeStartS1, changeStartS2; | |
169 protected T seq1, seq2; | |
170 | |
171 public void begin(T s1, T s2) { | |
172 seq1 = s1; | |
173 seq2 = s2; | |
174 changeStartS1 = changeStartS2 = 0; | |
175 } | |
176 | |
177 public void match(int startSeq1, int startSeq2, int matchLength) { | |
178 reportDeltaElement(startSeq1, startSeq2, matchLength); | |
179 changeStartS1 = startSeq1 + matchLength; | |
180 changeStartS2 = startSeq2 + matchLength; | |
181 } | |
182 | |
183 public void end() { | |
184 if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) { | |
185 reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0); | |
186 } | |
187 } | |
188 | |
189 protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) { | |
190 if (changeStartS1 < matchStartSeq1) { | |
191 if (changeStartS2 < matchStartSeq2) { | |
192 changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); | |
193 } else { | |
194 assert changeStartS2 == matchStartSeq2; | |
195 deleted(matchStartSeq2, changeStartS1, matchStartSeq1); | |
196 } | |
197 } else { | |
198 assert changeStartS1 == matchStartSeq1; | |
199 if(changeStartS2 < matchStartSeq2) { | |
200 added(changeStartS1, changeStartS2, matchStartSeq2); | |
201 } else { | |
202 assert changeStartS2 == matchStartSeq2; | |
203 if (matchStartSeq1 > 0 || matchStartSeq2 > 0) { | |
204 // FIXME perhaps, exception is too much for the case | |
205 // once diff is covered with tests, replace with assert false : msg; | |
206 throw new HgInvalidStateException(String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2)); | |
207 } | |
208 } | |
209 } | |
210 if (matchLength > 0) { | |
211 unchanged(matchStartSeq1, matchStartSeq2, matchLength); | |
212 } | |
213 } | |
214 | |
215 /** | |
216 * [s1From..s1To) replaced with [s2From..s2To) | |
217 */ | |
218 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
219 // NO-OP | |
220 } | |
221 | |
222 protected void deleted(int s2DeletePoint, int s1From, int s1To) { | |
223 // NO-OP | |
224 } | |
225 | |
226 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
227 // NO-OP | |
228 } | |
229 | |
230 protected void unchanged(int s1From, int s2From, int length) { | |
231 // NO-OP | |
232 } | |
233 } | |
234 | |
235 static class DeltaDumpInspector<T extends ChunkSequence<?>> extends DeltaInspector<T> { | |
236 | |
237 @Override | |
238 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
239 System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To); | |
240 } | |
241 | |
242 @Override | |
243 protected void deleted(int s2DeletionPoint, int s1From, int s1To) { | |
244 System.out.printf("deleted [%d..%d)\n", s1From, s1To); | |
245 } | |
246 | |
247 @Override | |
248 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
249 System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint); | |
250 } | |
251 | |
252 @Override | |
253 protected void unchanged(int s1From, int s2From, int length) { | |
254 System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length); | |
255 } | |
256 } | |
257 | |
258 /** | |
259 * Generic sequence of chunk, where chunk is anything comparable to another chunk, e.g. a string or a single char | |
260 * Sequence diff algorithm above doesn't care about sequence nature. | |
261 */ | |
262 public interface ChunkSequence<T> { | |
263 public T chunk(int index); | |
264 public int chunkCount(); | |
265 } | |
266 | |
267 public static final class LineSequence implements ChunkSequence<LineSequence.ByteChain> { | |
268 | |
269 private final byte[] input; | |
270 private ArrayList<ByteChain> lines; | |
271 | |
272 public LineSequence(byte[] data) { | |
273 input = data; | |
274 } | |
275 | |
276 public static LineSequence newlines(byte[] array) { | |
277 return new LineSequence(array).splitByNewlines(); | |
278 } | |
279 | |
280 // sequence ends with fake, empty line chunk | |
281 public LineSequence splitByNewlines() { | |
282 lines = new ArrayList<ByteChain>(); | |
283 int lastStart = 0; | |
284 for (int i = 0; i < input.length; i++) { | |
285 if (input[i] == '\n') { | |
286 lines.add(new ByteChain(lastStart, i+1)); | |
287 lastStart = i+1; | |
288 } else if (input[i] == '\r') { | |
289 if (i+1 < input.length && input[i+1] == '\n') { | |
290 i++; | |
291 } | |
292 lines.add(new ByteChain(lastStart, i+1)); | |
293 lastStart = i+1; | |
294 } | |
295 } | |
296 if (lastStart < input.length) { | |
297 lines.add(new ByteChain(lastStart, input.length)); | |
298 } | |
299 // empty chunk to keep offset of input end | |
300 lines.add(new ByteChain(input.length)); | |
301 return this; | |
302 } | |
303 | |
304 public ByteChain chunk(int index) { | |
305 return lines.get(index); | |
306 } | |
307 | |
308 public int chunkCount() { | |
309 return lines.size(); | |
310 } | |
311 | |
312 public byte[] data(int chunkFrom, int chunkTo) { | |
313 if (chunkFrom == chunkTo) { | |
314 return new byte[0]; | |
315 } | |
316 int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); | |
317 byte[] rv = new byte[to - from]; | |
318 System.arraycopy(input, from, rv, 0, rv.length); | |
319 return rv; | |
320 } | |
321 | |
322 | |
323 final class ByteChain { | |
324 private final int start, end; | |
325 private final int hash; | |
326 | |
327 /** | |
328 * construct a chunk with a sole purpose to keep | |
329 * offset of the data end | |
330 */ | |
331 ByteChain(int offset) { | |
332 start = end = offset; | |
333 // ensure this chunk doesn't match trailing chunk of another sequence | |
334 hash = System.identityHashCode(this); | |
335 } | |
336 | |
337 ByteChain(int s, int e) { | |
338 start = s; | |
339 end = e; | |
340 hash = calcHash(input, s, e); | |
341 } | |
342 | |
343 /** | |
344 * byte offset of the this ByteChain inside ChainSequence | |
345 */ | |
346 public int getOffset() { | |
347 return start; | |
348 } | |
349 | |
350 public byte[] data() { | |
351 byte[] rv = new byte[end - start]; | |
352 System.arraycopy(input, start, rv, 0, rv.length); | |
353 return rv; | |
354 } | |
355 | |
356 @Override | |
357 public boolean equals(Object obj) { | |
358 if (obj == null || obj.getClass() != ByteChain.class) { | |
359 return false; | |
360 } | |
361 ByteChain other = (ByteChain) obj; | |
362 if (other.hash != hash || other.end - other.start != end - start) { | |
363 return false; | |
364 } | |
365 return other.match(input, start); | |
366 } | |
367 | |
368 private boolean match(byte[] oi, int from) { | |
369 for (int i = start, j = from; i < end; i++, j++) { | |
370 if (LineSequence.this.input[i] != oi[j]) { | |
371 return false; | |
372 } | |
373 } | |
374 return true; | |
375 } | |
376 | |
377 @Override | |
378 public int hashCode() { | |
379 return hash; | |
380 } | |
381 | |
382 @Override | |
383 public String toString() { | |
384 return String.format("[@%d\"%s\"]", start, new String(data())); | |
385 } | |
386 } | |
387 | |
388 // same as Arrays.hashCode(byte[]), just for a slice of a bigger array | |
389 static int calcHash(byte[] data, int from, int to) { | |
390 int result = 1; | |
391 for (int i = from; i < to; i++) { | |
392 result = 31 * result + data[i]; | |
393 } | |
394 return result; | |
395 } | |
396 } | |
397 } |