Mercurial > jhg
comparison src/org/tmatesoft/hg/internal/diff/DiffHelper.java @ 703:7839ff0bfd78
Refactor: move diff/blame related code to a separate package
| author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
|---|---|
| date | Wed, 14 Aug 2013 14:51:51 +0200 |
| parents | src/org/tmatesoft/hg/internal/DiffHelper.java@58a6900f845d |
| children |
comparison
equal
deleted
inserted
replaced
| 702:992fa84e7885 | 703:7839ff0bfd78 |
|---|---|
| 1 /* | |
| 2 * Copyright (c) 2013 TMate Software Ltd | |
| 3 * | |
| 4 * This program is free software; you can redistribute it and/or modify | |
| 5 * it under the terms of the GNU General Public License as published by | |
| 6 * the Free Software Foundation; version 2 of the License. | |
| 7 * | |
| 8 * This program is distributed in the hope that it will be useful, | |
| 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 11 * GNU General Public License for more details. | |
| 12 * | |
| 13 * For information on how to redistribute this software under | |
| 14 * the terms of a license other than GNU General Public License | |
| 15 * contact TMate Software at support@hg4j.com | |
| 16 */ | |
| 17 package org.tmatesoft.hg.internal.diff; | |
| 18 | |
| 19 import java.util.ArrayList; | |
| 20 import java.util.HashMap; | |
| 21 import java.util.Map; | |
| 22 | |
| 23 import org.tmatesoft.hg.internal.IntMap; | |
| 24 import org.tmatesoft.hg.internal.IntSliceSeq; | |
| 25 import org.tmatesoft.hg.internal.IntTuple; | |
| 26 import org.tmatesoft.hg.internal.IntVector; | |
| 27 | |
| 28 /** | |
| 29 * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output): | |
| 30 * | |
| 31 * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 | |
| 32 * <PATCH>: | |
| 33 * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 | |
| 34 * | |
| 35 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded | |
| 36 * in the patch. | |
| 37 * | |
| 38 * Mercurial paper describes reasons for choosing this approach to delta generation, too. | |
| 39 * | |
| 40 * | |
| 41 * @author Artem Tikhomirov | |
| 42 * @author TMate Software Ltd. | |
| 43 */ | |
| 44 public class DiffHelper<T extends DiffHelper.ChunkSequence<?>> { | |
| 45 | |
| 46 private Map<Object, IntVector> chunk2UseIndex; | |
| 47 private T seq1, seq2; | |
| 48 | |
| 49 // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively | |
| 50 private int matchStartS1, matchStartS2; | |
| 51 | |
| 52 private MatchInspector<T> matchInspector; | |
| 53 | |
| 54 public void init(T s1, T s2) { | |
| 55 seq1 = s1; | |
| 56 seq2 = s2; | |
| 57 prepare(s2); | |
| 58 } | |
| 59 | |
| 60 public void init(T s1) { | |
| 61 if (seq2 == null) { | |
| 62 throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin"); | |
| 63 } | |
| 64 seq1 = s1; | |
| 65 } | |
| 66 | |
| 67 | |
| 68 private void prepare(T s2) { | |
| 69 chunk2UseIndex = new HashMap<Object, IntVector>(); | |
| 70 for (int i = 0, len = s2.chunkCount(); i < len; i++) { | |
| 71 Object bc = s2.chunk(i); | |
| 72 IntVector loc = chunk2UseIndex.get(bc); | |
| 73 if (loc == null) { | |
| 74 chunk2UseIndex.put(bc, loc = new IntVector()); | |
| 75 } | |
| 76 loc.add(i); | |
| 77 // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect | |
| 78 // in this case need to find the only ByteChain to keep indexes | |
| 79 // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) | |
| 80 // or kept within only one of them | |
| 81 } | |
| 82 } | |
| 83 | |
| 84 public void findMatchingBlocks(MatchInspector<T> insp) { | |
| 85 insp.begin(seq1, seq2); | |
| 86 matchInspector = insp; | |
| 87 findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); | |
| 88 insp.end(); | |
| 89 } | |
| 90 | |
| 91 /** | |
| 92 * look up every line in s2 that match lines in s1 | |
| 93 * idea: pure additions in s2 are diff-ed against s1 again and again, to see if there are any matches | |
| 94 */ | |
| 95 void findAllMatchAlternatives(final MatchInspector<T> insp) { | |
| 96 assert seq1.chunkCount() > 0; | |
| 97 final IntSliceSeq insertions = new IntSliceSeq(2); | |
| 98 final boolean matchedAny[] = new boolean[] {false}; | |
| 99 DeltaInspector<T> myInsp = new DeltaInspector<T>() { | |
| 100 @Override | |
| 101 protected void unchanged(int s1From, int s2From, int length) { | |
| 102 matchedAny[0] = true; | |
| 103 insp.match(s1From, s2From, length); | |
| 104 } | |
| 105 @Override | |
| 106 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
| 107 insertions.add(s2From, s2To); | |
| 108 } | |
| 109 }; | |
| 110 matchInspector = myInsp; | |
| 111 myInsp.begin(seq1, seq2); | |
| 112 IntSliceSeq s2RangesToCheck = new IntSliceSeq(2, 1, 0); | |
| 113 s2RangesToCheck.add(0, seq2.chunkCount()); | |
| 114 do { | |
| 115 IntSliceSeq nextCheck = new IntSliceSeq(2); | |
| 116 for (IntTuple t : s2RangesToCheck) { | |
| 117 int s2Start = t.at(0); | |
| 118 int s2End = t.at(1); | |
| 119 myInsp.changeStartS1 = 0; | |
| 120 myInsp.changeStartS2 = s2Start; | |
| 121 insp.begin(seq1, seq2); | |
| 122 matchedAny[0] = false; | |
| 123 findMatchingBlocks(0, seq1.chunkCount(), s2Start, s2End); | |
| 124 insp.end(); | |
| 125 myInsp.end(); | |
| 126 if (matchedAny[0]) { | |
| 127 nextCheck.addAll(insertions); | |
| 128 } | |
| 129 insertions.clear(); | |
| 130 } | |
| 131 s2RangesToCheck = nextCheck; | |
| 132 } while (s2RangesToCheck.size() > 0); | |
| 133 } | |
| 134 | |
| 135 /** | |
| 136 * implementation based on Python's difflib.py and SequenceMatcher | |
| 137 */ | |
| 138 public int longestMatch(int startS1, int endS1, int startS2, int endS2) { | |
| 139 matchStartS1 = matchStartS2 = 0; | |
| 140 int maxLength = 0; | |
| 141 IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8); | |
| 142 for (int i = startS1; i < endS1; i++) { | |
| 143 Object bc = seq1.chunk(i); | |
| 144 IntVector occurencesInS2 = chunk2UseIndex.get(bc); | |
| 145 if (occurencesInS2 == null) { | |
| 146 chunkIndex2MatchCount.clear(); | |
| 147 continue; | |
| 148 } | |
| 149 IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8); | |
| 150 for (int j : occurencesInS2.toArray()) { | |
| 151 // s1[i] == s2[j] | |
| 152 if (j < startS2) { | |
| 153 continue; | |
| 154 } | |
| 155 if (j >= endS2) { | |
| 156 break; | |
| 157 } | |
| 158 int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; | |
| 159 int k = prevChunkMatches + 1; | |
| 160 newChunkIndex2MatchCount.put(j, k); | |
| 161 if (k > maxLength) { | |
| 162 matchStartS1 = i-k+1; | |
| 163 matchStartS2 = j-k+1; | |
| 164 maxLength = k; | |
| 165 } | |
| 166 } | |
| 167 chunkIndex2MatchCount = newChunkIndex2MatchCount; | |
| 168 } | |
| 169 return maxLength; | |
| 170 } | |
| 171 | |
| 172 private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { | |
| 173 int matchLength = longestMatch(startS1, endS1, startS2, endS2); | |
| 174 if (matchLength > 0) { | |
| 175 final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; | |
| 176 if (startS1 < matchStartS1 && startS2 < matchStartS2) { | |
| 177 findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); | |
| 178 } | |
| 179 matchInspector.match(saveStartS1, saveStartS2, matchLength); | |
| 180 if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { | |
| 181 findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); | |
| 182 } | |
| 183 } | |
| 184 } | |
| 185 | |
| 186 public interface MatchInspector<T extends ChunkSequence<?>> { | |
| 187 void begin(T s1, T s2); | |
| 188 void match(int startSeq1, int startSeq2, int matchLength); | |
| 189 void end(); | |
| 190 } | |
| 191 | |
| 192 static class MatchDumpInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
| 193 private int matchCount; | |
| 194 | |
| 195 public void begin(T s1, T s2) { | |
| 196 matchCount = 0; | |
| 197 } | |
| 198 | |
| 199 public void match(int startSeq1, int startSeq2, int matchLength) { | |
| 200 matchCount++; | |
| 201 System.out.printf("match #%d: from line #%d and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength); | |
| 202 } | |
| 203 | |
| 204 public void end() { | |
| 205 if (matchCount == 0) { | |
| 206 System.out.println("NO MATCHES FOUND!"); | |
| 207 } | |
| 208 } | |
| 209 } | |
| 210 | |
| 211 /** | |
| 212 * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed". | |
| 213 */ | |
| 214 public static class DeltaInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
| 215 protected int changeStartS1, changeStartS2; | |
| 216 protected T seq1, seq2; | |
| 217 | |
| 218 public void begin(T s1, T s2) { | |
| 219 seq1 = s1; | |
| 220 seq2 = s2; | |
| 221 changeStartS1 = changeStartS2 = 0; | |
| 222 } | |
| 223 | |
| 224 public void match(int startSeq1, int startSeq2, int matchLength) { | |
| 225 reportDeltaElement(startSeq1, startSeq2, matchLength); | |
| 226 changeStartS1 = startSeq1 + matchLength; | |
| 227 changeStartS2 = startSeq2 + matchLength; | |
| 228 } | |
| 229 | |
| 230 public void end() { | |
| 231 if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) { | |
| 232 reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0); | |
| 233 } | |
| 234 } | |
| 235 | |
| 236 protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) { | |
| 237 if (changeStartS1 < matchStartSeq1) { | |
| 238 if (changeStartS2 < matchStartSeq2) { | |
| 239 changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); | |
| 240 } else { | |
| 241 assert changeStartS2 == matchStartSeq2; | |
| 242 deleted(matchStartSeq2, changeStartS1, matchStartSeq1); | |
| 243 } | |
| 244 } else { | |
| 245 assert changeStartS1 == matchStartSeq1; | |
| 246 if(changeStartS2 < matchStartSeq2) { | |
| 247 added(changeStartS1, changeStartS2, matchStartSeq2); | |
| 248 } else { | |
| 249 assert changeStartS2 == matchStartSeq2; | |
| 250 if (matchStartSeq1 > 0 || matchStartSeq2 > 0) { | |
| 251 assert false : String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); | |
| 252 } | |
| 253 } | |
| 254 } | |
| 255 if (matchLength > 0) { | |
| 256 unchanged(matchStartSeq1, matchStartSeq2, matchLength); | |
| 257 } | |
| 258 } | |
| 259 | |
| 260 /** | |
| 261 * [s1From..s1To) replaced with [s2From..s2To) | |
| 262 */ | |
| 263 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
| 264 // NO-OP | |
| 265 } | |
| 266 | |
| 267 protected void deleted(int s2DeletePoint, int s1From, int s1To) { | |
| 268 // NO-OP | |
| 269 } | |
| 270 | |
| 271 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
| 272 // NO-OP | |
| 273 } | |
| 274 | |
| 275 protected void unchanged(int s1From, int s2From, int length) { | |
| 276 // NO-OP | |
| 277 } | |
| 278 } | |
| 279 | |
| 280 public static class DeltaDumpInspector<T extends ChunkSequence<?>> extends DeltaInspector<T> { | |
| 281 | |
| 282 @Override | |
| 283 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
| 284 System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To); | |
| 285 } | |
| 286 | |
| 287 @Override | |
| 288 protected void deleted(int s2DeletionPoint, int s1From, int s1To) { | |
| 289 System.out.printf("deleted [%d..%d)\n", s1From, s1To); | |
| 290 } | |
| 291 | |
| 292 @Override | |
| 293 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
| 294 System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint); | |
| 295 } | |
| 296 | |
| 297 @Override | |
| 298 protected void unchanged(int s1From, int s2From, int length) { | |
| 299 System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length); | |
| 300 } | |
| 301 } | |
| 302 | |
| 303 /** | |
| 304 * Generic sequence of chunk, where chunk is anything comparable to another chunk, e.g. a string or a single char | |
| 305 * Sequence diff algorithm above doesn't care about sequence nature. | |
| 306 */ | |
| 307 public interface ChunkSequence<T> { | |
| 308 public T chunk(int index); | |
| 309 public int chunkCount(); | |
| 310 } | |
| 311 | |
| 312 public static final class LineSequence implements ChunkSequence<LineSequence.ByteChain> { | |
| 313 | |
| 314 private final byte[] input; | |
| 315 private ArrayList<ByteChain> lines; | |
| 316 | |
| 317 public LineSequence(byte[] data) { | |
| 318 input = data; | |
| 319 } | |
| 320 | |
| 321 public static LineSequence newlines(byte[] array) { | |
| 322 return new LineSequence(array).splitByNewlines(); | |
| 323 } | |
| 324 | |
| 325 // sequence ends with fake, empty line chunk | |
| 326 public LineSequence splitByNewlines() { | |
| 327 lines = new ArrayList<ByteChain>(); | |
| 328 int lastStart = 0; | |
| 329 for (int i = 0; i < input.length; i++) { | |
| 330 if (input[i] == '\n') { | |
| 331 lines.add(new ByteChain(lastStart, i+1)); | |
| 332 lastStart = i+1; | |
| 333 } else if (input[i] == '\r') { | |
| 334 if (i+1 < input.length && input[i+1] == '\n') { | |
| 335 i++; | |
| 336 } | |
| 337 lines.add(new ByteChain(lastStart, i+1)); | |
| 338 lastStart = i+1; | |
| 339 } | |
| 340 } | |
| 341 if (lastStart < input.length) { | |
| 342 lines.add(new ByteChain(lastStart, input.length)); | |
| 343 } | |
| 344 // empty chunk to keep offset of input end | |
| 345 lines.add(new ByteChain(input.length)); | |
| 346 return this; | |
| 347 } | |
| 348 | |
| 349 public ByteChain chunk(int index) { | |
| 350 return lines.get(index); | |
| 351 } | |
| 352 | |
| 353 public int chunkCount() { | |
| 354 return lines.size(); | |
| 355 } | |
| 356 | |
| 357 public byte[] data(int chunkFrom, int chunkTo) { | |
| 358 if (chunkFrom == chunkTo) { | |
| 359 return new byte[0]; | |
| 360 } | |
| 361 int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); | |
| 362 byte[] rv = new byte[to - from]; | |
| 363 System.arraycopy(input, from, rv, 0, rv.length); | |
| 364 return rv; | |
| 365 } | |
| 366 | |
| 367 | |
| 368 public final class ByteChain { | |
| 369 private final int start, end; | |
| 370 private final int hash; | |
| 371 | |
| 372 /** | |
| 373 * construct a chunk with a sole purpose to keep | |
| 374 * offset of the data end | |
| 375 */ | |
| 376 ByteChain(int offset) { | |
| 377 start = end = offset; | |
| 378 // ensure this chunk doesn't match trailing chunk of another sequence | |
| 379 hash = System.identityHashCode(this); | |
| 380 } | |
| 381 | |
| 382 ByteChain(int s, int e) { | |
| 383 start = s; | |
| 384 end = e; | |
| 385 hash = calcHash(input, s, e); | |
| 386 } | |
| 387 | |
| 388 /** | |
| 389 * byte offset of the this ByteChain inside ChainSequence | |
| 390 */ | |
| 391 public int getOffset() { | |
| 392 return start; | |
| 393 } | |
| 394 | |
| 395 public byte[] data() { | |
| 396 byte[] rv = new byte[end - start]; | |
| 397 System.arraycopy(input, start, rv, 0, rv.length); | |
| 398 return rv; | |
| 399 } | |
| 400 | |
| 401 @Override | |
| 402 public boolean equals(Object obj) { | |
| 403 if (obj == null || obj.getClass() != ByteChain.class) { | |
| 404 return false; | |
| 405 } | |
| 406 ByteChain other = (ByteChain) obj; | |
| 407 if (other.hash != hash || other.end - other.start != end - start) { | |
| 408 return false; | |
| 409 } | |
| 410 return other.match(input, start); | |
| 411 } | |
| 412 | |
| 413 private boolean match(byte[] oi, int from) { | |
| 414 for (int i = start, j = from; i < end; i++, j++) { | |
| 415 if (LineSequence.this.input[i] != oi[j]) { | |
| 416 return false; | |
| 417 } | |
| 418 } | |
| 419 return true; | |
| 420 } | |
| 421 | |
| 422 @Override | |
| 423 public int hashCode() { | |
| 424 return hash; | |
| 425 } | |
| 426 | |
| 427 @Override | |
| 428 public String toString() { | |
| 429 return String.format("[@%d\"%s\"]", start, new String(data())); | |
| 430 } | |
| 431 } | |
| 432 | |
| 433 // same as Arrays.hashCode(byte[]), just for a slice of a bigger array | |
| 434 static int calcHash(byte[] data, int from, int to) { | |
| 435 int result = 1; | |
| 436 for (int i = from; i < to; i++) { | |
| 437 result = 31 * result + data[i]; | |
| 438 } | |
| 439 return result; | |
| 440 } | |
| 441 } | |
| 442 } |
