view src/org/tmatesoft/hg/internal/diff/DiffHelper.java @ 709:497e697636fc

Report merged lines as changed block if possible, not as a sequence of added/deleted blocks. To facilitate access to merge parent lines AddBlock got mergeLineAt() method that reports index of the line in the second parent (if any), while insertedAt() has been changed to report index in the first parent always
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Wed, 21 Aug 2013 16:23:27 +0200
parents 7839ff0bfd78
children
line wrap: on
line source
/*
 * Copyright (c) 2013 TMate Software Ltd
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * For information on how to redistribute this software under
 * the terms of a license other than GNU General Public License
 * contact TMate Software at support@hg4j.com
 */
package org.tmatesoft.hg.internal.diff;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.tmatesoft.hg.internal.IntMap;
import org.tmatesoft.hg.internal.IntSliceSeq;
import org.tmatesoft.hg.internal.IntTuple;
import org.tmatesoft.hg.internal.IntVector;

/**
 * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output):
 * 
 *   522:        233748      0        103      17438        433        522      521       -1     756073cf2321df44d3ed0585f2a5754bc8a1b2f6
 *   <PATCH>:
 *   3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8
 *   
 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded
 * in the patch.
 * 
 * Mercurial paper describes reasons for choosing this approach to delta generation, too.
 * 
 * 
 * @author Artem Tikhomirov
 * @author TMate Software Ltd.
 */
public class DiffHelper<T extends DiffHelper.ChunkSequence<?>> {

	private Map<Object, IntVector> chunk2UseIndex;
	private T seq1, seq2;

	// get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively
	private int matchStartS1, matchStartS2;

	private MatchInspector<T> matchInspector; 

	public void init(T s1, T s2) {
		seq1 = s1;
		seq2 = s2;
		prepare(s2);
	}
	
	public void init(T s1) {
		if (seq2 == null) {
			throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin");
		}
		seq1 = s1;
	}


	private void prepare(T s2) {
		chunk2UseIndex = new HashMap<Object, IntVector>();
		for (int i = 0, len = s2.chunkCount(); i < len; i++) {
			Object bc = s2.chunk(i);
			IntVector loc = chunk2UseIndex.get(bc);
			if (loc == null) {
				chunk2UseIndex.put(bc, loc = new IntVector());
			}
			loc.add(i);
			// bc.registerUseIn(i) - BEWARE, use of bc here is incorrect
			// in this case need to find the only ByteChain to keep indexes
			// i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector)
			// or kept within only one of them
		}
	}
	
	public void findMatchingBlocks(MatchInspector<T> insp) {
		insp.begin(seq1, seq2);
		matchInspector = insp;
		findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount());
		insp.end();
	}
	
	/** 
	 * look up every line in s2 that match lines in s1
	 * idea: pure additions in s2 are diff-ed against s1 again and again, to see if there are any matches
	 */
	void findAllMatchAlternatives(final MatchInspector<T> insp) {
		assert seq1.chunkCount() > 0;
		final IntSliceSeq insertions = new IntSliceSeq(2);
		final boolean matchedAny[] = new boolean[] {false};
		DeltaInspector<T> myInsp = new DeltaInspector<T>() {
			@Override
			protected void unchanged(int s1From, int s2From, int length) {
				matchedAny[0] = true;
				insp.match(s1From, s2From, length);
			}
			@Override
			protected void added(int s1InsertPoint, int s2From, int s2To) {
				insertions.add(s2From, s2To);
			}
		};
		matchInspector = myInsp;
		myInsp.begin(seq1, seq2);
		IntSliceSeq s2RangesToCheck = new IntSliceSeq(2, 1, 0);
		s2RangesToCheck.add(0, seq2.chunkCount());
		do {
			IntSliceSeq nextCheck = new IntSliceSeq(2);
			for (IntTuple t : s2RangesToCheck) {
				int s2Start = t.at(0);
				int s2End = t.at(1);
				myInsp.changeStartS1 = 0;
				myInsp.changeStartS2 = s2Start;
				insp.begin(seq1, seq2);
				matchedAny[0] = false;
				findMatchingBlocks(0, seq1.chunkCount(), s2Start, s2End);
				insp.end();
				myInsp.end();
				if (matchedAny[0]) {
					nextCheck.addAll(insertions);
				}
				insertions.clear();
			}
			s2RangesToCheck = nextCheck;
		} while (s2RangesToCheck.size() > 0);
	}
	
	/**
	 * implementation based on Python's difflib.py and SequenceMatcher 
	 */
	public int longestMatch(int startS1, int endS1, int startS2, int endS2) {
		matchStartS1 = matchStartS2 = 0;
		int maxLength = 0;
		IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8);
		for (int i = startS1; i < endS1; i++) {
			Object bc = seq1.chunk(i);
			IntVector occurencesInS2 = chunk2UseIndex.get(bc);
			if (occurencesInS2 == null) {
				chunkIndex2MatchCount.clear();
				continue;
			}
			IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8);
			for (int j : occurencesInS2.toArray()) {
				// s1[i] == s2[j]
				if (j < startS2) {
					continue;
				}
				if (j >= endS2) {
					break;
				}
				int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0;
				int k = prevChunkMatches + 1;
				newChunkIndex2MatchCount.put(j, k);
				if (k > maxLength) {
					matchStartS1 = i-k+1;
					matchStartS2 = j-k+1;
					maxLength = k;
				}
			}
			chunkIndex2MatchCount = newChunkIndex2MatchCount;
		}
		return maxLength;
	}
	
	private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) {
		int matchLength = longestMatch(startS1, endS1, startS2, endS2);
		if (matchLength > 0) {
			final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2;
			if (startS1 < matchStartS1 && startS2 < matchStartS2) {
				findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2);
			}
			matchInspector.match(saveStartS1, saveStartS2, matchLength);
			if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) {
				findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2);
			}
		}
	}
	
	public interface MatchInspector<T extends ChunkSequence<?>> {
		void begin(T s1, T s2);
		void match(int startSeq1, int startSeq2, int matchLength);
		void end();
	}
	
	static class MatchDumpInspector<T extends ChunkSequence<?>> implements MatchInspector<T> {
		private int matchCount;

		public void begin(T s1, T s2) {
			matchCount = 0;
		}

		public void match(int startSeq1, int startSeq2, int matchLength) {
			matchCount++;
			System.out.printf("match #%d: from line #%d  and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength);
		}

		public void end() {
			if (matchCount == 0) {
				System.out.println("NO MATCHES FOUND!");
			}
		}
	}
	
	/**
	 * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed". 
	 */
	public static class DeltaInspector<T extends ChunkSequence<?>> implements MatchInspector<T> {
		protected int changeStartS1, changeStartS2;
		protected T seq1, seq2;

		public void begin(T s1, T s2) {
			seq1 = s1;
			seq2 = s2;
			changeStartS1 = changeStartS2 = 0;
		}

		public void match(int startSeq1, int startSeq2, int matchLength) {
			reportDeltaElement(startSeq1, startSeq2, matchLength);
			changeStartS1 = startSeq1 + matchLength;
			changeStartS2 = startSeq2 + matchLength;
		}

		public void end() {
			if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) {
				reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0);
			}
		}

		protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) {
			if (changeStartS1 < matchStartSeq1) {
				if (changeStartS2 < matchStartSeq2) {
					changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2);
				} else {
					assert changeStartS2 == matchStartSeq2;
					deleted(matchStartSeq2, changeStartS1, matchStartSeq1);
				}
			} else {
				assert changeStartS1 == matchStartSeq1;
				if(changeStartS2 < matchStartSeq2) {
					added(changeStartS1, changeStartS2, matchStartSeq2);
				} else {
					assert changeStartS2 == matchStartSeq2;
					if (matchStartSeq1 > 0 || matchStartSeq2 > 0) {
						assert false : String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2);
					}
				}
			}
			if (matchLength > 0) {
				unchanged(matchStartSeq1, matchStartSeq2, matchLength);
			}
		}

		/**
		 * [s1From..s1To) replaced with [s2From..s2To)
		 */
		protected void changed(int s1From, int s1To, int s2From, int s2To) {
			// NO-OP
		}

		protected void deleted(int s2DeletePoint, int s1From, int s1To) {
			// NO-OP
		}

		protected void added(int s1InsertPoint, int s2From, int s2To) {
			// NO-OP
		}

		protected void unchanged(int s1From, int s2From, int length) {
			// NO-OP
		}
	}
	
	public static class DeltaDumpInspector<T extends ChunkSequence<?>> extends DeltaInspector<T> {

		@Override
		protected void changed(int s1From, int s1To, int s2From, int s2To) {
			System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To);
		}
		
		@Override
		protected void deleted(int s2DeletionPoint, int s1From, int s1To) {
			System.out.printf("deleted [%d..%d)\n", s1From, s1To);
		}
		
		@Override
		protected void added(int s1InsertPoint, int s2From, int s2To) {
			System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint);
		}

		@Override
		protected void unchanged(int s1From, int s2From, int length) {
			System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length);
		}
	}
	
	/**
	 * Generic sequence of chunk, where chunk is anything comparable to another chunk, e.g. a string or a single char
	 * Sequence diff algorithm above doesn't care about sequence nature.
	 */
	public interface ChunkSequence<T> {
		public T chunk(int index);
		public int chunkCount();
	}
	
	public static final class LineSequence implements ChunkSequence<LineSequence.ByteChain> {
		
		private final byte[] input;
		private ArrayList<ByteChain> lines;

		public LineSequence(byte[] data) {
			input = data;
		}
		
		public static LineSequence newlines(byte[] array) {
			return new LineSequence(array).splitByNewlines();
		}

		// sequence ends with fake, empty line chunk
		public LineSequence splitByNewlines() {
			lines = new ArrayList<ByteChain>();
			int lastStart = 0;
			for (int i = 0; i < input.length; i++) {
				if (input[i] == '\n') {
					lines.add(new ByteChain(lastStart, i+1));
					lastStart = i+1;
				} else if (input[i] == '\r') {
					if (i+1 < input.length && input[i+1] == '\n') {
						i++;
					}
					lines.add(new ByteChain(lastStart, i+1));
					lastStart = i+1;
				}
			}
			if (lastStart < input.length) {
				lines.add(new ByteChain(lastStart, input.length));
			}
			// empty chunk to keep offset of input end
			lines.add(new ByteChain(input.length));
			return this;
		}
		
		public ByteChain chunk(int index) {
			return lines.get(index);
		}
		
		public int chunkCount() {
			return lines.size();
		}
		
		public byte[] data(int chunkFrom, int chunkTo) {
			if (chunkFrom == chunkTo) {
				return new byte[0];
			}
			int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset();
			byte[] rv = new byte[to - from];
			System.arraycopy(input, from, rv, 0, rv.length);
			return rv;
		}

		
		public final class ByteChain {
			private final int start, end;
			private final int hash;
			
			/**
			 * construct a chunk with a sole purpose to keep 
			 * offset of the data end
			 */
			ByteChain(int offset) {
				start = end = offset;
				// ensure this chunk doesn't match trailing chunk of another sequence
				hash = System.identityHashCode(this);
			}
			
			ByteChain(int s, int e) {
				start = s;
				end = e;
				hash = calcHash(input, s, e);
			}
			
			/**
			 * byte offset of the this ByteChain inside ChainSequence 
			 */
			public int getOffset() {
				return start;
			}
			
			public byte[] data() {
				byte[] rv = new byte[end - start];
				System.arraycopy(input, start, rv, 0, rv.length);
				return rv;
			}
			
			@Override
			public boolean equals(Object obj) {
				if (obj == null || obj.getClass() != ByteChain.class) {
					return false;
				}
				ByteChain other = (ByteChain) obj;
				if (other.hash != hash || other.end - other.start != end - start) {
					return false;
				}
				return other.match(input, start);
			}
			
			private boolean match(byte[] oi, int from) {
				for (int i = start, j = from; i < end; i++, j++) {
					if (LineSequence.this.input[i] != oi[j]) {
						return false;
					}
				}
				return true;
			}
			
			@Override
			public int hashCode() {
				return hash;
			}
			
			@Override
			public String toString() {
				return String.format("[@%d\"%s\"]", start, new String(data()));
			}
		}

		// same as Arrays.hashCode(byte[]), just for a slice of a bigger array
		static int calcHash(byte[] data, int from, int to) {
			int result = 1;
			for (int i = from; i < to; i++) {
				result = 31 * result + data[i];
			}
			return result;
		}
	}
}