view src/org/tmatesoft/hg/repo/HgStatusCollector.java @ 709:497e697636fc

Report merged lines as changed block if possible, not as a sequence of added/deleted blocks. To facilitate access to merge parent lines AddBlock got mergeLineAt() method that reports index of the line in the second parent (if any), while insertedAt() has been changed to report index in the first parent always
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Wed, 21 Aug 2013 16:23:27 +0200
parents 7efabe0cddcf
children
line wrap: on
line source
/*
 * Copyright (c) 2011-2012 TMate Software Ltd
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * For information on how to redistribute this software under
 * the terms of a license other than GNU General Public License
 * contact TMate Software at support@hg4j.com
 */
package org.tmatesoft.hg.repo;

import static org.tmatesoft.hg.repo.HgRepository.*;

import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.tmatesoft.hg.core.Nodeid;
import org.tmatesoft.hg.internal.FileRenameHistory;
import org.tmatesoft.hg.internal.FileRenameHistory.Chunk;
import org.tmatesoft.hg.internal.IntMap;
import org.tmatesoft.hg.internal.ManifestRevision;
import org.tmatesoft.hg.internal.Pool;
import org.tmatesoft.hg.util.CancelSupport;
import org.tmatesoft.hg.util.CancelledException;
import org.tmatesoft.hg.util.Convertor;
import org.tmatesoft.hg.util.Path;


/**
 * Collect status information for changes between two repository revisions.
 *
 * @author Artem Tikhomirov
 * @author TMate Software Ltd.
 */
public class HgStatusCollector {

	private final HgRepository repo;
	private final IntMap<ManifestRevision> cache; // sparse array, in fact
	// with cpython repository, ~70 000 changes, complete Log (direct out, no reverse) output 
	// no cache limit, no nodeids and fname caching - OOME on changeset 1035
	// no cache limit, but with cached nodeids and filenames - 1730+
	// cache limit 100 - 19+ minutes to process 10000, and still working (too long, stopped)
	private final int cacheMaxSize = 50; // do not keep too much manifest revisions
	private Convertor<Path> pathPool;
	private final Pool<Nodeid> cacheNodes;
	private final Pool<Path> cacheFilenames;
	private final ManifestRevision emptyFakeState;
	private Path.Matcher scope = new Path.Matcher.Any();
	// @see #detectCopies()
	private boolean detectCopies = true;
	

	public HgStatusCollector(HgRepository hgRepo) {
		this.repo = hgRepo;
		cache = new IntMap<ManifestRevision>(cacheMaxSize);
		cacheNodes = new Pool<Nodeid>();
		cacheFilenames = new Pool<Path>();

		emptyFakeState = createEmptyManifestRevision();
	}
	
	public HgRepository getRepo() {
		return repo;
	}
	
	private ManifestRevision get(int rev) throws HgRuntimeException {
		ManifestRevision i = cache.get(rev);
		if (i == null) {
			if (rev == NO_REVISION) {
				return emptyFakeState;
			}
			ensureCacheSize();
			i = new ManifestRevision(cacheNodes, cacheFilenames);
			cache.put(rev, i);
			repo.getManifest().walk(rev, rev, i);
		}
		return i;
	}

	private boolean cached(int revision) {
		return cache.containsKey(revision) || revision == NO_REVISION;
	}
	
	private void ensureCacheSize() {
		if (cache.size() > cacheMaxSize) {
			// assume usually we go from oldest to newest, hence remove oldest as most likely to be no longer necessary
			cache.removeFromStart(cache.size() - cacheMaxSize + 1 /* room for new element */);
		}
	}
	
	private void initCacheRange(int minRev, int maxRev) throws HgRuntimeException {
		ensureCacheSize();
		// In fact, walk(minRev, maxRev) doesn't imply
		// there would be maxRev-minRev+1 revisions visited. For example,
		// check cpython repo with 'hg log -r 22418:22420 --debug' and admire
		// manifest revisions 66650, 21683, 21684.  Thus, innocent walk(22418,22420) results in 40k+ revisions and OOME
		// Instead, be explicit of what revisions are of interest
		assert minRev <= maxRev;
		int[] revisionsToCollect = new int[maxRev - minRev + 1];
		for (int x = minRev, i = 0; x <= maxRev; i++, x++) {
			revisionsToCollect[i] = x;
		}
		repo.getManifest().walk(new HgManifest.Inspector() {
			private ManifestRevision delegate;
			private boolean cacheHit; // range may include revisions we already know about, do not re-create them

			public boolean begin(int manifestRevision, Nodeid nid, int changelogRevision) {
				assert delegate == null;
				if (cache.containsKey(changelogRevision)) { // don't need to check emptyFakeState hit as revision never NO_REVISION here
					cacheHit = true;
				} else {
					cache.put(changelogRevision, delegate = new ManifestRevision(cacheNodes, cacheFilenames));
					// cache may grow bigger than max size here, but it's ok as present simplistic cache clearing mechanism may
					// otherwise remove entries we just added
					delegate.begin(manifestRevision, nid, changelogRevision);
					cacheHit = false;
				}
				return true;
			}

			public boolean next(Nodeid nid, Path fname, HgManifest.Flags flags) {
				if (!cacheHit) {
					delegate.next(nid, fname, flags);
				}
				return true;
			}
			
			public boolean end(int revision) {
				if (!cacheHit) {
					delegate.end(revision);
				}
				cacheHit = false;				
				delegate = null;
				return true;
			}
		}, revisionsToCollect);
	}
	
	/*package-local*/ static ManifestRevision createEmptyManifestRevision() {
		ManifestRevision fakeEmptyRev = new ManifestRevision(null, null);
		fakeEmptyRev.begin(NO_REVISION, null, NO_REVISION);
		fakeEmptyRev.end(NO_REVISION);
		return fakeEmptyRev;
	}
	
	/**
	 * Access specific manifest revision
	 * @param rev 
	 * @return
	 * @throws HgInvalidControlFileException
	 */
	/*package-local*/ ManifestRevision raw(int rev) throws HgRuntimeException {
		return get(rev);
	}
	/*package-local*/ Convertor<Path> getPathPool() {
		if (pathPool == null) {
			pathPool = cacheFilenames;
		}
		return pathPool;
	}

	/**
	 * Allows sharing of a common path cache 
	 */
	public void setPathPool(Convertor<Path> pathConvertor) {
		pathPool = pathConvertor;
	}

	/**
	 * Limit activity of the collector to certain sub-tree of the repository.
	 * @param scopeMatcher tells whether collector shall report specific path, can be <code>null</code>
	 */
	public void setScope(Path.Matcher scopeMatcher) {
		// do not assign null, ever
		scope = scopeMatcher == null ? new Path.Matcher.Any() : scopeMatcher;
	}

	/**
	 * Select whether Collector shall tell "added-new" from "added-by-copy/rename" files.
	 * This is analogous to '-C' switch of 'hg status' command.
	 * 
	 * <p>With copy detection turned off, files continue be reported as plain 'added' files.
	 * 
	 * <p>By default, copy detection is <em>on</em>, as it's reasonably cheap. However,
	 * in certain scenarios it may be reasonable to turn it off, for example when it's a merge
	 * of two very different branches and there are a lot of files added/moved.
	 *  
	 * Another legitimate reason to set detection to off if you're lazy to 
	 * implement {@link HgStatusInspector#copied(Path, Path)} ;)
	 * 
	 * @param detect <code>true</code> if copies detection is desirable
	 */
	public void detectCopies(boolean detect) {
		// cpython, revision:72161, p1:72159, p2:72160
		// p2 comes from another branch with 321 file added (looks like copied/moved, however, the isCopy
		// record present only for couple of them). With 2,5 ms per isCopy() operation, almost a second
		// is spent detecting origins (according to Marc, of little use in this scenario, as it's second parent 
		// in the merge) - in fact, most of the time of the status operation
		detectCopies = detect;
	}
	
	/**
	 * 'hg status --change REV' command counterpart.
	 * 
	 * @throws CancelledException if operation execution was cancelled
	 * @throws HgRuntimeException subclass thereof to indicate issues with the library. <em>Runtime exception</em>
	 */
	public void change(int revisionIndex, HgStatusInspector inspector) throws CancelledException, HgRuntimeException {
		int p;
		if (revisionIndex == 0) {
			p = NO_REVISION;
		} else {
			int[] parents = new int[2];
			repo.getChangelog().parents(revisionIndex, parents, null, null);
			// #parents call above is responsible for NO_REVISION
			p = parents[0]; // hg --change alsways uses first parent, despite the fact there might be valid (-1, 18) pair of parents
		}
		walk(p, revisionIndex, inspector);
	}
	
	/**
	 * Parameters <b>rev1</b> and <b>rev2</b> are changelog revision indexes, shall not be the same. Argument order matters.
	 * Either rev1 or rev2 may be {@link HgRepository#NO_REVISION} to indicate comparison to empty repository
	 * 
	 * @param rev1 <em>from</em> changeset index, non-negative or {@link HgRepository#TIP}
	 * @param rev2 <em>to</em> changeset index, non-negative or {@link HgRepository#TIP}
	 * @param inspector callback for status information
	 * @throws CancelledException if operation execution was cancelled
	 * @throws HgRuntimeException subclass thereof to indicate issues with the library. <em>Runtime exception</em>
	 * @throws IllegalArgumentException inspector other incorrect argument values
	 */
	public void walk(int rev1, int rev2, HgStatusInspector inspector) throws CancelledException, HgRuntimeException, IllegalArgumentException {
		if (rev1 == rev2) {
			throw new IllegalArgumentException();
		}
		if (inspector == null) {
			throw new IllegalArgumentException();
		}
		final int lastChangelogRevision = repo.getChangelog().getLastRevision();
		if (rev1 == TIP) {
			rev1 = lastChangelogRevision;
		}
		if (rev2 == TIP) {
			rev2 = lastChangelogRevision; 
		}
		if (rev1 != NO_REVISION && (HgInternals.wrongRevisionIndex(rev1) || rev1 == WORKING_COPY || rev1 == BAD_REVISION || rev1 > lastChangelogRevision)) {
			throw new HgInvalidRevisionException(rev1);
		}
		if (rev2 != NO_REVISION && (HgInternals.wrongRevisionIndex(rev2) || rev2 == WORKING_COPY || rev2 == BAD_REVISION || rev2 > lastChangelogRevision)) {
			throw new HgInvalidRevisionException(rev2);
		}
		if (inspector instanceof Record) {
			((Record) inspector).init(rev1, rev2, this);
		}
		// in fact, rev1 and rev2 are often next (or close) to each other,
		// thus, we can optimize Manifest reads here (manifest.walk(rev1, rev2))
		ManifestRevision r1, r2 ;
		boolean need1 = !cached(rev1), need2 = !cached(rev2);
		if (need1 || need2) {
			int minRev, maxRev;
			if (need1 && need2 && Math.abs(rev1 - rev2) < 5 /*subjective equivalent of 'close enough'*/) {
				minRev = rev1 < rev2 ? rev1 : rev2;
				maxRev = minRev == rev1 ? rev2 : rev1;
				if (minRev > 0) {
					minRev--; // expand range a bit
				}
				initCacheRange(minRev, maxRev);
				need1 = need2 = false;
			}
			// either both unknown and far from each other, or just one of them.
			// read with neighbors to save potential subsequent calls for neighboring elements
			// XXX perhaps, if revlog.baseRevision is cheap, shall expand minRev up to baseRevision
			// which going to be read anyway
			if (need1) {
				minRev = rev1;
				maxRev = rev1 < lastChangelogRevision-5 ? rev1+5 : lastChangelogRevision;
				initCacheRange(minRev, maxRev);
			}
			if (need2) {
				minRev = rev2;
				maxRev = rev2 < lastChangelogRevision-5 ? rev2+5 : lastChangelogRevision;
				initCacheRange(minRev, maxRev);
			}
		}
		r1 = get(rev1);
		r2 = get(rev2);

		final CancelSupport cs = CancelSupport.Factory.get(inspector);

		Collection<Path> allBaseFiles = r1.files();
		TreeSet<Path> r1Files = new TreeSet<Path>(allBaseFiles);
		for (Path r2fname : r2.files()) {
			if (!scope.accept(r2fname)) {
				continue;
			}
			if (r1Files.remove(r2fname)) {
				Nodeid nidR1 = r1.nodeid(r2fname);
				Nodeid nidR2 = r2.nodeid(r2fname);
				HgManifest.Flags flagsR1 = r1.flags(r2fname);
				HgManifest.Flags flagsR2 = r2.flags(r2fname);
				if (nidR1.equals(nidR2) && flagsR2 == flagsR1) {
					inspector.clean(r2fname);
				} else {
					inspector.modified(r2fname);
				}
				cs.checkCancelled();
			} else {
				try {
					Path copyTarget = r2fname;
					Path copyOrigin = detectCopies ? getOriginIfCopy(repo, copyTarget, r2.nodeid(copyTarget), allBaseFiles, rev1) : null;
					if (copyOrigin != null) {
						inspector.copied(getPathPool().mangle(copyOrigin) /*pipe through pool, just in case*/, copyTarget);
					} else {
						inspector.added(copyTarget);
					}
				} catch (HgInvalidFileException ex) {
					// record exception to a mediator and continue, 
					// for a single file not to be irresolvable obstacle for a status operation
					inspector.invalid(r2fname, ex);
				}
				cs.checkCancelled();
			}
		}
		for (Path r1fname : r1Files) {
			if (scope.accept(r1fname)) {
				inspector.removed(r1fname);
				cs.checkCancelled();
			}
		}
	}
	
	/**
	 * Collects status between two revisions, changes from <b>rev1</b> up to <b>rev2</b>.
	 * 
	 * @param rev1 <em>from</em> changeset index 
	 * @param rev2 <em>to</em> changeset index
	 * @return information object that describes change between the revisions
	 * @throws HgInvalidRevisionException if any supplied revision doesn't identify changeset revision. <em>Runtime exception</em>
	 * @throws HgInvalidControlFileException if failed to access revlog index/data entry. <em>Runtime exception</em>
	 * @throws HgRuntimeException subclass thereof to indicate other issues with the library. <em>Runtime exception</em>
	 */
	public Record status(int rev1, int rev2) throws HgRuntimeException {
		Record rv = new Record();
		try {
			walk(rev1, rev2, rv);
		} catch (CancelledException ex) {
			// can't happen as long our Record class doesn't implement CancelSupport
			HgInvalidStateException t = new HgInvalidStateException("Internal error");
			t.initCause(ex);
			throw t;
		}
		return rv;
	}
	
	// originals shall include all names known in base revision, not those not yet consumed
	// see TestStatus#testDetectRenamesInNonFirstRev and log-renames test repository
	// where a and d in r5 are compared to a and b in r1, while d is in fact descendant of original a, and a is original b (through c)
	/*package-local*/static Path getOriginIfCopy(HgRepository hgRepo, Path fname, Nodeid fnameRev, Collection<Path> originals, int originalChangesetIndex) throws HgRuntimeException {
		HgDataFile df = hgRepo.getFileNode(fname);
		if (!df.exists()) {
			String msg = String.format("Didn't find file '%s' in the repo. Perhaps, bad storage name conversion?", fname);
			throw new HgInvalidFileException(msg, null).setFileName(fname).setRevisionIndex(originalChangesetIndex);
		}
		assert fnameRev != null;
		assert !Nodeid.NULL.equals(fnameRev); 
		int fileRevIndex = fnameRev == null ? 0 : df.getRevisionIndex(fnameRev);
		FileRenameHistory frh = new FileRenameHistory(originalChangesetIndex, df.getChangesetRevisionIndex(fileRevIndex));
		if (frh.isOutOfRange(df, fileRevIndex)) {
			return null;
		}
		frh.build(df, fileRevIndex);
		Chunk c = frh.chunkAt(originalChangesetIndex);
		if (c == null) {
			// file rename history doesn't go deep up to changeset of interest
			return null;
		}
		Path nameAtOrigin = c.file().getPath();
		if (originals.contains(nameAtOrigin)) {
			return nameAtOrigin;
		}
		return null;
	}

	// XXX for r1..r2 status, only modified, added, removed (and perhaps, clean) make sense
	// XXX Need to specify whether copy targets are in added or not (@see Inspector#copied above)
	/**
	 * Straightforward {@link HgStatusInspector} implementation that collects all status values.
	 * 
	 * <p>Naturally, {@link Record Records} originating from {@link HgStatusCollector} would report only <em>modified, added,
	 * removed</em> and <em>clean</em> values, other are available only when using {@link Record} with {@link HgWorkingCopyStatusCollector}.
	 * 
	 * <p>Note, this implementation records copied files as added, thus key values in {@link #getCopied()} map are subset of paths
	 * from {@link #getAdded()}.  
	 */
	public static class Record implements HgStatusInspector {
		// NOTE, shall not implement CancelSupport, or methods that use it and don't expect this exception shall be changed
		private List<Path> modified, added, removed, clean, missing, unknown, ignored;
		private Map<Path, Path> copied;
		private Map<Path, Exception> failures;
		
		private int startRev, endRev;
		private HgStatusCollector statusHelper;
		
		// XXX StatusCollector may additionally initialize Record instance to speed lookup of changed file revisions
		// here I need access to ManifestRevisionInspector via #raw(). Perhaps, non-static class (to get
		// implicit reference to StatusCollector) may be better?
		// Since users may want to reuse Record instance we've once created (and initialized), we need to  
		// ensure functionality is correct for each/any call (#walk checks instanceof Record and fixes it up)
		// Perhaps, distinct helper (sc.getRevisionHelper().nodeid(fname)) would be better, just not clear
		// how to supply [start..end] values there easily
		/*package-local*/void init(int startRevision, int endRevision, HgStatusCollector self) {
			startRev = startRevision;
			endRev = endRevision;
			statusHelper = self;
		}
		
		public Nodeid nodeidBeforeChange(Path fname) throws HgRuntimeException {
			if (statusHelper == null || startRev == BAD_REVISION) {
				return null;
			}
			if ((modified == null || !modified.contains(fname)) && (removed == null || !removed.contains(fname))) {
				return null;
			}
			return statusHelper.raw(startRev).nodeid(fname);
		}
		public Nodeid nodeidAfterChange(Path fname) throws HgRuntimeException {
			if (statusHelper == null || endRev == BAD_REVISION) {
				return null;
			}
			if ((modified == null || !modified.contains(fname)) && (added == null || !added.contains(fname))) {
				return null;
			}
			return statusHelper.raw(endRev).nodeid(fname);
		}
		
		public List<Path> getModified() {
			return proper(modified);
		}

		public List<Path> getAdded() {
			return proper(added);
		}

		public List<Path> getRemoved() {
			return proper(removed);
		}

		/**
		 * Map files from {@link #getAdded()} to their original filenames, if were copied/moved.
		 */
		public Map<Path,Path> getCopied() {
			if (copied == null) {
				return Collections.emptyMap();
			}
			return Collections.unmodifiableMap(copied);
		}

		public List<Path> getClean() {
			return proper(clean);
		}

		public List<Path> getMissing() {
			return proper(missing);
		}

		public List<Path> getUnknown() {
			return proper(unknown);
		}

		public List<Path> getIgnored() {
			return proper(ignored);
		}

		public Map<Path, Exception> getInvalid() {
			if (failures == null) {
				return Collections.emptyMap();
			}
			return Collections.unmodifiableMap(failures);
		}
		
		private static List<Path> proper(List<Path> l) {
			if (l == null) {
				return Collections.emptyList();
			}
			return Collections.unmodifiableList(l);
		}

		//
		//
		
		public void modified(Path fname) {
			modified = doAdd(modified, fname);
		}

		public void added(Path fname) {
			added = doAdd(added, fname);
		}

		public void copied(Path fnameOrigin, Path fnameAdded) {
			if (copied == null) {
				copied = new LinkedHashMap<Path, Path>();
			}
			added(fnameAdded);
			copied.put(fnameAdded, fnameOrigin);
		}

		public void removed(Path fname) {
			removed = doAdd(removed, fname);
		}

		public void clean(Path fname) {
			clean = doAdd(clean, fname);
		}

		public void missing(Path fname) {
			missing = doAdd(missing, fname);
		}

		public void unknown(Path fname) {
			unknown = doAdd(unknown, fname);
		}

		public void ignored(Path fname) {
			ignored = doAdd(ignored, fname);
		}
		
		public void invalid(Path fname, Exception ex) {
			if (failures == null) {
				failures = new LinkedHashMap<Path, Exception>();
			}
			failures.put(fname, ex);
		}

		private static List<Path> doAdd(List<Path> l, Path p) {
			if (l == null) {
				l = new LinkedList<Path>();
			}
			l.add(p);
			return l;
		}
	}

}