Mercurial > jhg
view src/org/tmatesoft/hg/repo/HgStatusCollector.java @ 694:7efabe0cddcf
Speed up (a) file rename history to minimize file reads; (b) file.isCopy(int) to read metadata for few revisions at once (use pattern assumes earlier revisions are likely to be queried, too); (c) HgIgnore.isIgnored by caching matched initial fragments (to substitute more expensive Matcher.matches with cheaper HashMap.contains)
author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
---|---|
date | Mon, 05 Aug 2013 17:42:10 +0200 |
parents | 72fc7774b87e |
children |
line wrap: on
line source
/* * Copyright (c) 2011-2012 TMate Software Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * For information on how to redistribute this software under * the terms of a license other than GNU General Public License * contact TMate Software at support@hg4j.com */ package org.tmatesoft.hg.repo; import static org.tmatesoft.hg.repo.HgRepository.*; import java.util.Collection; import java.util.Collections; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.TreeSet; import org.tmatesoft.hg.core.Nodeid; import org.tmatesoft.hg.internal.FileRenameHistory; import org.tmatesoft.hg.internal.FileRenameHistory.Chunk; import org.tmatesoft.hg.internal.IntMap; import org.tmatesoft.hg.internal.ManifestRevision; import org.tmatesoft.hg.internal.Pool; import org.tmatesoft.hg.util.CancelSupport; import org.tmatesoft.hg.util.CancelledException; import org.tmatesoft.hg.util.Convertor; import org.tmatesoft.hg.util.Path; /** * Collect status information for changes between two repository revisions. * * @author Artem Tikhomirov * @author TMate Software Ltd. */ public class HgStatusCollector { private final HgRepository repo; private final IntMap<ManifestRevision> cache; // sparse array, in fact // with cpython repository, ~70 000 changes, complete Log (direct out, no reverse) output // no cache limit, no nodeids and fname caching - OOME on changeset 1035 // no cache limit, but with cached nodeids and filenames - 1730+ // cache limit 100 - 19+ minutes to process 10000, and still working (too long, stopped) private final int cacheMaxSize = 50; // do not keep too much manifest revisions private Convertor<Path> pathPool; private final Pool<Nodeid> cacheNodes; private final Pool<Path> cacheFilenames; private final ManifestRevision emptyFakeState; private Path.Matcher scope = new Path.Matcher.Any(); // @see #detectCopies() private boolean detectCopies = true; public HgStatusCollector(HgRepository hgRepo) { this.repo = hgRepo; cache = new IntMap<ManifestRevision>(cacheMaxSize); cacheNodes = new Pool<Nodeid>(); cacheFilenames = new Pool<Path>(); emptyFakeState = createEmptyManifestRevision(); } public HgRepository getRepo() { return repo; } private ManifestRevision get(int rev) throws HgRuntimeException { ManifestRevision i = cache.get(rev); if (i == null) { if (rev == NO_REVISION) { return emptyFakeState; } ensureCacheSize(); i = new ManifestRevision(cacheNodes, cacheFilenames); cache.put(rev, i); repo.getManifest().walk(rev, rev, i); } return i; } private boolean cached(int revision) { return cache.containsKey(revision) || revision == NO_REVISION; } private void ensureCacheSize() { if (cache.size() > cacheMaxSize) { // assume usually we go from oldest to newest, hence remove oldest as most likely to be no longer necessary cache.removeFromStart(cache.size() - cacheMaxSize + 1 /* room for new element */); } } private void initCacheRange(int minRev, int maxRev) throws HgRuntimeException { ensureCacheSize(); // In fact, walk(minRev, maxRev) doesn't imply // there would be maxRev-minRev+1 revisions visited. For example, // check cpython repo with 'hg log -r 22418:22420 --debug' and admire // manifest revisions 66650, 21683, 21684. Thus, innocent walk(22418,22420) results in 40k+ revisions and OOME // Instead, be explicit of what revisions are of interest assert minRev <= maxRev; int[] revisionsToCollect = new int[maxRev - minRev + 1]; for (int x = minRev, i = 0; x <= maxRev; i++, x++) { revisionsToCollect[i] = x; } repo.getManifest().walk(new HgManifest.Inspector() { private ManifestRevision delegate; private boolean cacheHit; // range may include revisions we already know about, do not re-create them public boolean begin(int manifestRevision, Nodeid nid, int changelogRevision) { assert delegate == null; if (cache.containsKey(changelogRevision)) { // don't need to check emptyFakeState hit as revision never NO_REVISION here cacheHit = true; } else { cache.put(changelogRevision, delegate = new ManifestRevision(cacheNodes, cacheFilenames)); // cache may grow bigger than max size here, but it's ok as present simplistic cache clearing mechanism may // otherwise remove entries we just added delegate.begin(manifestRevision, nid, changelogRevision); cacheHit = false; } return true; } public boolean next(Nodeid nid, Path fname, HgManifest.Flags flags) { if (!cacheHit) { delegate.next(nid, fname, flags); } return true; } public boolean end(int revision) { if (!cacheHit) { delegate.end(revision); } cacheHit = false; delegate = null; return true; } }, revisionsToCollect); } /*package-local*/ static ManifestRevision createEmptyManifestRevision() { ManifestRevision fakeEmptyRev = new ManifestRevision(null, null); fakeEmptyRev.begin(NO_REVISION, null, NO_REVISION); fakeEmptyRev.end(NO_REVISION); return fakeEmptyRev; } /** * Access specific manifest revision * @param rev * @return * @throws HgInvalidControlFileException */ /*package-local*/ ManifestRevision raw(int rev) throws HgRuntimeException { return get(rev); } /*package-local*/ Convertor<Path> getPathPool() { if (pathPool == null) { pathPool = cacheFilenames; } return pathPool; } /** * Allows sharing of a common path cache */ public void setPathPool(Convertor<Path> pathConvertor) { pathPool = pathConvertor; } /** * Limit activity of the collector to certain sub-tree of the repository. * @param scopeMatcher tells whether collector shall report specific path, can be <code>null</code> */ public void setScope(Path.Matcher scopeMatcher) { // do not assign null, ever scope = scopeMatcher == null ? new Path.Matcher.Any() : scopeMatcher; } /** * Select whether Collector shall tell "added-new" from "added-by-copy/rename" files. * This is analogous to '-C' switch of 'hg status' command. * * <p>With copy detection turned off, files continue be reported as plain 'added' files. * * <p>By default, copy detection is <em>on</em>, as it's reasonably cheap. However, * in certain scenarios it may be reasonable to turn it off, for example when it's a merge * of two very different branches and there are a lot of files added/moved. * * Another legitimate reason to set detection to off if you're lazy to * implement {@link HgStatusInspector#copied(Path, Path)} ;) * * @param detect <code>true</code> if copies detection is desirable */ public void detectCopies(boolean detect) { // cpython, revision:72161, p1:72159, p2:72160 // p2 comes from another branch with 321 file added (looks like copied/moved, however, the isCopy // record present only for couple of them). With 2,5 ms per isCopy() operation, almost a second // is spent detecting origins (according to Marc, of little use in this scenario, as it's second parent // in the merge) - in fact, most of the time of the status operation detectCopies = detect; } /** * 'hg status --change REV' command counterpart. * * @throws CancelledException if operation execution was cancelled * @throws HgRuntimeException subclass thereof to indicate issues with the library. <em>Runtime exception</em> */ public void change(int revisionIndex, HgStatusInspector inspector) throws CancelledException, HgRuntimeException { int p; if (revisionIndex == 0) { p = NO_REVISION; } else { int[] parents = new int[2]; repo.getChangelog().parents(revisionIndex, parents, null, null); // #parents call above is responsible for NO_REVISION p = parents[0]; // hg --change alsways uses first parent, despite the fact there might be valid (-1, 18) pair of parents } walk(p, revisionIndex, inspector); } /** * Parameters <b>rev1</b> and <b>rev2</b> are changelog revision indexes, shall not be the same. Argument order matters. * Either rev1 or rev2 may be {@link HgRepository#NO_REVISION} to indicate comparison to empty repository * * @param rev1 <em>from</em> changeset index, non-negative or {@link HgRepository#TIP} * @param rev2 <em>to</em> changeset index, non-negative or {@link HgRepository#TIP} * @param inspector callback for status information * @throws CancelledException if operation execution was cancelled * @throws HgRuntimeException subclass thereof to indicate issues with the library. <em>Runtime exception</em> * @throws IllegalArgumentException inspector other incorrect argument values */ public void walk(int rev1, int rev2, HgStatusInspector inspector) throws CancelledException, HgRuntimeException, IllegalArgumentException { if (rev1 == rev2) { throw new IllegalArgumentException(); } if (inspector == null) { throw new IllegalArgumentException(); } final int lastChangelogRevision = repo.getChangelog().getLastRevision(); if (rev1 == TIP) { rev1 = lastChangelogRevision; } if (rev2 == TIP) { rev2 = lastChangelogRevision; } if (rev1 != NO_REVISION && (HgInternals.wrongRevisionIndex(rev1) || rev1 == WORKING_COPY || rev1 == BAD_REVISION || rev1 > lastChangelogRevision)) { throw new HgInvalidRevisionException(rev1); } if (rev2 != NO_REVISION && (HgInternals.wrongRevisionIndex(rev2) || rev2 == WORKING_COPY || rev2 == BAD_REVISION || rev2 > lastChangelogRevision)) { throw new HgInvalidRevisionException(rev2); } if (inspector instanceof Record) { ((Record) inspector).init(rev1, rev2, this); } // in fact, rev1 and rev2 are often next (or close) to each other, // thus, we can optimize Manifest reads here (manifest.walk(rev1, rev2)) ManifestRevision r1, r2 ; boolean need1 = !cached(rev1), need2 = !cached(rev2); if (need1 || need2) { int minRev, maxRev; if (need1 && need2 && Math.abs(rev1 - rev2) < 5 /*subjective equivalent of 'close enough'*/) { minRev = rev1 < rev2 ? rev1 : rev2; maxRev = minRev == rev1 ? rev2 : rev1; if (minRev > 0) { minRev--; // expand range a bit } initCacheRange(minRev, maxRev); need1 = need2 = false; } // either both unknown and far from each other, or just one of them. // read with neighbors to save potential subsequent calls for neighboring elements // XXX perhaps, if revlog.baseRevision is cheap, shall expand minRev up to baseRevision // which going to be read anyway if (need1) { minRev = rev1; maxRev = rev1 < lastChangelogRevision-5 ? rev1+5 : lastChangelogRevision; initCacheRange(minRev, maxRev); } if (need2) { minRev = rev2; maxRev = rev2 < lastChangelogRevision-5 ? rev2+5 : lastChangelogRevision; initCacheRange(minRev, maxRev); } } r1 = get(rev1); r2 = get(rev2); final CancelSupport cs = CancelSupport.Factory.get(inspector); Collection<Path> allBaseFiles = r1.files(); TreeSet<Path> r1Files = new TreeSet<Path>(allBaseFiles); for (Path r2fname : r2.files()) { if (!scope.accept(r2fname)) { continue; } if (r1Files.remove(r2fname)) { Nodeid nidR1 = r1.nodeid(r2fname); Nodeid nidR2 = r2.nodeid(r2fname); HgManifest.Flags flagsR1 = r1.flags(r2fname); HgManifest.Flags flagsR2 = r2.flags(r2fname); if (nidR1.equals(nidR2) && flagsR2 == flagsR1) { inspector.clean(r2fname); } else { inspector.modified(r2fname); } cs.checkCancelled(); } else { try { Path copyTarget = r2fname; Path copyOrigin = detectCopies ? getOriginIfCopy(repo, copyTarget, r2.nodeid(copyTarget), allBaseFiles, rev1) : null; if (copyOrigin != null) { inspector.copied(getPathPool().mangle(copyOrigin) /*pipe through pool, just in case*/, copyTarget); } else { inspector.added(copyTarget); } } catch (HgInvalidFileException ex) { // record exception to a mediator and continue, // for a single file not to be irresolvable obstacle for a status operation inspector.invalid(r2fname, ex); } cs.checkCancelled(); } } for (Path r1fname : r1Files) { if (scope.accept(r1fname)) { inspector.removed(r1fname); cs.checkCancelled(); } } } /** * Collects status between two revisions, changes from <b>rev1</b> up to <b>rev2</b>. * * @param rev1 <em>from</em> changeset index * @param rev2 <em>to</em> changeset index * @return information object that describes change between the revisions * @throws HgInvalidRevisionException if any supplied revision doesn't identify changeset revision. <em>Runtime exception</em> * @throws HgInvalidControlFileException if failed to access revlog index/data entry. <em>Runtime exception</em> * @throws HgRuntimeException subclass thereof to indicate other issues with the library. <em>Runtime exception</em> */ public Record status(int rev1, int rev2) throws HgRuntimeException { Record rv = new Record(); try { walk(rev1, rev2, rv); } catch (CancelledException ex) { // can't happen as long our Record class doesn't implement CancelSupport HgInvalidStateException t = new HgInvalidStateException("Internal error"); t.initCause(ex); throw t; } return rv; } // originals shall include all names known in base revision, not those not yet consumed // see TestStatus#testDetectRenamesInNonFirstRev and log-renames test repository // where a and d in r5 are compared to a and b in r1, while d is in fact descendant of original a, and a is original b (through c) /*package-local*/static Path getOriginIfCopy(HgRepository hgRepo, Path fname, Nodeid fnameRev, Collection<Path> originals, int originalChangesetIndex) throws HgRuntimeException { HgDataFile df = hgRepo.getFileNode(fname); if (!df.exists()) { String msg = String.format("Didn't find file '%s' in the repo. Perhaps, bad storage name conversion?", fname); throw new HgInvalidFileException(msg, null).setFileName(fname).setRevisionIndex(originalChangesetIndex); } assert fnameRev != null; assert !Nodeid.NULL.equals(fnameRev); int fileRevIndex = fnameRev == null ? 0 : df.getRevisionIndex(fnameRev); FileRenameHistory frh = new FileRenameHistory(originalChangesetIndex, df.getChangesetRevisionIndex(fileRevIndex)); if (frh.isOutOfRange(df, fileRevIndex)) { return null; } frh.build(df, fileRevIndex); Chunk c = frh.chunkAt(originalChangesetIndex); if (c == null) { // file rename history doesn't go deep up to changeset of interest return null; } Path nameAtOrigin = c.file().getPath(); if (originals.contains(nameAtOrigin)) { return nameAtOrigin; } return null; } // XXX for r1..r2 status, only modified, added, removed (and perhaps, clean) make sense // XXX Need to specify whether copy targets are in added or not (@see Inspector#copied above) /** * Straightforward {@link HgStatusInspector} implementation that collects all status values. * * <p>Naturally, {@link Record Records} originating from {@link HgStatusCollector} would report only <em>modified, added, * removed</em> and <em>clean</em> values, other are available only when using {@link Record} with {@link HgWorkingCopyStatusCollector}. * * <p>Note, this implementation records copied files as added, thus key values in {@link #getCopied()} map are subset of paths * from {@link #getAdded()}. */ public static class Record implements HgStatusInspector { // NOTE, shall not implement CancelSupport, or methods that use it and don't expect this exception shall be changed private List<Path> modified, added, removed, clean, missing, unknown, ignored; private Map<Path, Path> copied; private Map<Path, Exception> failures; private int startRev, endRev; private HgStatusCollector statusHelper; // XXX StatusCollector may additionally initialize Record instance to speed lookup of changed file revisions // here I need access to ManifestRevisionInspector via #raw(). Perhaps, non-static class (to get // implicit reference to StatusCollector) may be better? // Since users may want to reuse Record instance we've once created (and initialized), we need to // ensure functionality is correct for each/any call (#walk checks instanceof Record and fixes it up) // Perhaps, distinct helper (sc.getRevisionHelper().nodeid(fname)) would be better, just not clear // how to supply [start..end] values there easily /*package-local*/void init(int startRevision, int endRevision, HgStatusCollector self) { startRev = startRevision; endRev = endRevision; statusHelper = self; } public Nodeid nodeidBeforeChange(Path fname) throws HgRuntimeException { if (statusHelper == null || startRev == BAD_REVISION) { return null; } if ((modified == null || !modified.contains(fname)) && (removed == null || !removed.contains(fname))) { return null; } return statusHelper.raw(startRev).nodeid(fname); } public Nodeid nodeidAfterChange(Path fname) throws HgRuntimeException { if (statusHelper == null || endRev == BAD_REVISION) { return null; } if ((modified == null || !modified.contains(fname)) && (added == null || !added.contains(fname))) { return null; } return statusHelper.raw(endRev).nodeid(fname); } public List<Path> getModified() { return proper(modified); } public List<Path> getAdded() { return proper(added); } public List<Path> getRemoved() { return proper(removed); } /** * Map files from {@link #getAdded()} to their original filenames, if were copied/moved. */ public Map<Path,Path> getCopied() { if (copied == null) { return Collections.emptyMap(); } return Collections.unmodifiableMap(copied); } public List<Path> getClean() { return proper(clean); } public List<Path> getMissing() { return proper(missing); } public List<Path> getUnknown() { return proper(unknown); } public List<Path> getIgnored() { return proper(ignored); } public Map<Path, Exception> getInvalid() { if (failures == null) { return Collections.emptyMap(); } return Collections.unmodifiableMap(failures); } private static List<Path> proper(List<Path> l) { if (l == null) { return Collections.emptyList(); } return Collections.unmodifiableList(l); } // // public void modified(Path fname) { modified = doAdd(modified, fname); } public void added(Path fname) { added = doAdd(added, fname); } public void copied(Path fnameOrigin, Path fnameAdded) { if (copied == null) { copied = new LinkedHashMap<Path, Path>(); } added(fnameAdded); copied.put(fnameAdded, fnameOrigin); } public void removed(Path fname) { removed = doAdd(removed, fname); } public void clean(Path fname) { clean = doAdd(clean, fname); } public void missing(Path fname) { missing = doAdd(missing, fname); } public void unknown(Path fname) { unknown = doAdd(unknown, fname); } public void ignored(Path fname) { ignored = doAdd(ignored, fname); } public void invalid(Path fname, Exception ex) { if (failures == null) { failures = new LinkedHashMap<Path, Exception>(); } failures.put(fname, ex); } private static List<Path> doAdd(List<Path> l, Path p) { if (l == null) { l = new LinkedList<Path>(); } l.add(p); return l; } } }