Mercurial > hg4j
diff src/org/tmatesoft/hg/internal/KeywordFilter.java @ 711:a62079bc422b
Keyword filtering that doesn't depend on input buffer size or on the way input lines got split between filter() calls. KeywordFilter now keeps state to hold partially processed suspicious "...$" lines
author | Artem Tikhomirov <tikhomirov.artem@gmail.com> |
---|---|
date | Fri, 11 Oct 2013 21:35:41 +0200 |
parents | cf200271439a |
children |
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/internal/KeywordFilter.java Mon Oct 07 01:56:05 2013 +0200 +++ b/src/org/tmatesoft/hg/internal/KeywordFilter.java Fri Oct 11 21:35:41 2013 +0200 @@ -21,6 +21,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Date; +import java.util.Map; import java.util.TreeMap; import org.tmatesoft.hg.core.Nodeid; @@ -36,13 +37,12 @@ * @author TMate Software Ltd. */ public class KeywordFilter implements Filter { - // present implementation is stateless, however, filter use pattern shall not assume that. In fact, Factory may us that private final HgRepository repo; private final boolean isExpanding; - private final TreeMap<String,String> keywords; - private final int minBufferLen; + private final Map<String,String> keywords; private final Path path; private RawChangeset latestFileCset; + private final ByteVector unprocessedBuffer; /** * @@ -50,31 +50,12 @@ * @param path * @param expand <code>true</code> to expand keywords, <code>false</code> to shrink */ - private KeywordFilter(HgRepository hgRepo, Path p, boolean expand) { + private KeywordFilter(HgRepository hgRepo, Path p, Map<String, String> kw, boolean expand) { repo = hgRepo; path = p; isExpanding = expand; - keywords = new TreeMap<String,String>(); - keywords.put("Id", "Id"); - keywords.put("Revision", "Revision"); - keywords.put("Author", "Author"); - keywords.put("Date", "Date"); - keywords.put("LastChangedRevision", "LastChangedRevision"); - keywords.put("LastChangedBy", "LastChangedBy"); - keywords.put("LastChangedDate", "LastChangedDate"); - keywords.put("Source", "Source"); - keywords.put("Header", "Header"); - - int l = 0; - for (String s : keywords.keySet()) { - if (s.length() > l) { - l = s.length(); - } - } - // TODO post-1.0 later may implement #filter() not to read full kw value (just "$kw:"). However, limit of maxLen + 2 would keep valid. - // for buffers less then minBufferLen, there are chances #filter() implementation would never end - // (i.e. 
for input "$LongestKey"$ - minBufferLen = l + 2 + (isExpanding ? 0 : 120 /*any reasonable constant for max possible kw value length*/); + keywords = kw; + unprocessedBuffer = expand ? new ByteVector(0, 0) : new ByteVector(120, 50); } /** @@ -84,151 +65,150 @@ * start with them */ public ByteBuffer filter(ByteBuffer src) { - int keywordStart = indexOf(src, '$', src.position(), false); - if (keywordStart != -1 && src.capacity() < minBufferLen) { - // FIXME this check is unlucky when small files are read for status 'areTheSame' check - small buffer is allocated. - // the check for keywordStart('$') is a temp solution to minimize the chances to get this exception. - // Complete solution requires complete rewriting of this method to respect cases when keywords are split between buffers. - // With 'honest' partial kw handling, need for this check would be gone. - throw new IllegalStateException(String.format("Need buffer of at least %d bytes to ensure filter won't hang", minBufferLen)); - } - ByteBuffer rv = null; - int x = src.position(); - int copyFrom = x; // needs to be updated each time we copy a slice, but not each time we modify source index (x) - while (x < src.limit()) { - if (keywordStart == -1) { - int i = indexOf(src, '$', x, false); - if (i == -1) { - if (rv == null) { - return src; - } else { - copySlice(src, copyFrom, src.limit(), rv); - rv.flip(); - src.position(src.limit()); - return rv; + // when unprocessedBuffer is empty, we are looking for first $ in the input, + // when we've already got anything unprocessed, newline is of interest, too + int kwBreak = indexOf(src, '$', src.position(), !unprocessedBuffer.isEmpty()); + ByteBuffer outBuffer = null; + while (kwBreak != -1) { + if (unprocessedBuffer.isEmpty()) { + // both expand and collapse cases + assert src.get(kwBreak) == '$'; + + int end = indexOf(src, '$', kwBreak+1, true); + if (end == -1) { + for (int i = kwBreak; i < src.limit(); i++) { + unprocessedBuffer.add(src.get(i)); } - } - keywordStart 
= i; - // fall-through - } - if (keywordStart >= 0) { - int i = indexOf(src, '$', keywordStart+1, true); - if (i == -1) { - // end of buffer reached - if (rv == null) { - if (keywordStart == x) { - // TODO post-1.0 in fact, x might be equal to keywordStart and to src.position() here ('$' is first character in the buffer, - // and there are no other '$' not eols till the end of the buffer). This would lead to deadlock (filter won't consume any - // bytes). To prevent this, either shall copy bytes [keywordStart..buffer.limit()) to local buffer and use it on the next invocation, - // or add lookup of the keywords right after first '$' is found (do not wait for closing '$'). For now, large enough src buffer would be sufficient - // not to run into such situation - throw new IllegalStateException("Try src buffer of a greater size"); + src.limit(kwBreak); + kwBreak = -1; + // src up to kwBreak is left and returned either with outBuffer or alone + } else if (src.get(end) == '$') { + StringBuilder sb = new StringBuilder(end - kwBreak); + for (int i = kwBreak+1; i < end; i++) { + if (src.get(i) == ':' || src.get(i) == ' ') { + break; } - rv = ByteBuffer.allocate(keywordStart - copyFrom); + sb.append((char) src.get(i)); } - // copy all from source till latest possible kw start - copySlice(src, copyFrom, keywordStart, rv); - rv.flip(); - // and tell caller we've consumed only to the potential kw start - src.position(keywordStart); - return rv; - } else if (src.get(i) == '$') { - // end of keyword, or start of a new one. - String keyword; - if ((keyword = matchKeyword(src, keywordStart, i)) != null) { - if (rv == null) { - // src.remaining(), not .capacity because src is not read, and remaining represents - // actual bytes count, while capacity - potential. - // Factor of 4 is pure guess and a HACK, need to be fixed with re-expanding buffer on demand - rv = ByteBuffer.allocate(isExpanding ? 
src.remaining() * 4 : src.remaining()); + final String keyword = sb.toString(); + if (knownKeyword(keyword)) { + // copy src up to kw, including starting $keyword + outBuffer = append(outBuffer, src, kwBreak - src.position() + 1+keyword.length()); + // replace kwStart..end with new content + outBuffer = ensureCapacityFor(outBuffer, (isExpanding ? 200 : 1)); + if (isExpanding) { + outBuffer.put((byte) ':'); + outBuffer.put((byte) ' '); + outBuffer = expandKeywordValue(keyword, outBuffer); + outBuffer.put((byte) ' '); } - copySlice(src, copyFrom, keywordStart+1, rv); - rv.put(keyword.getBytes()); - if (isExpanding) { - rv.put((byte) ':'); - rv.put((byte) ' '); - expandKeywordValue(keyword, rv); - rv.put((byte) ' '); - } - rv.put((byte) '$'); - keywordStart = -1; - x = i+1; - copyFrom = x; - continue; + outBuffer.put((byte) '$'); + // src is consumed up to end + src.position(end+1); + kwBreak = indexOf(src, '$', end+1, false); } else { - if (rv != null) { - // we've already did some substitution, thus need to copy bytes we've scanned. - copySlice(src, x, i, rv); - copyFrom = i; - } // no else in attempt to avoid rv creation if no real kw would be found - keywordStart = i; - x = i; // '$' at i wasn't consumed, hence x points to i, not i+1. 
This is to avoid problems with case: "sdfsd $ asdfs $Id$ sdf" - continue; + // no (or unknown) keyword, try with '$' at src[end] + kwBreak = end; } } else { - assert src.get(i) == '\n' || src.get(i) == '\r'; - // line break - if (rv != null) { - copySlice(src, x, i+1, rv); - copyFrom = i+1; + // newline, ignore keyword start + kwBreak = indexOf(src, '$', end+1, false); + } + } else { + // we've got smth unprocessed, and we've matched either $ or NL + // the only chance to get here is when src is in the very start + if (src.get(kwBreak) == '$') { + // closed tag + for (int i = src.position(); i <= kwBreak; i++) { + // consume src: going to handle its [position*()..kwBreak] as part of unprocessedBuffer + unprocessedBuffer.add(src.get()); } - x = i+1; - keywordStart = -1; // Wasn't keyword, really - continue; // try once again + StringBuilder sb = new StringBuilder(unprocessedBuffer.size()); + assert unprocessedBuffer.get(0) == '$'; + for (int i = 1; i < unprocessedBuffer.size(); i++) { + char ch = (char) unprocessedBuffer.get(i); + if (ch == ':' || ch == ' ') { + break; + } + sb.append(ch); + } + final String keyword = sb.toString(); + if (knownKeyword(keyword)) { + outBuffer = ensureCapacityFor(outBuffer, keyword.length() + (isExpanding ? 
200 : 2)); + outBuffer.put((byte) '$'); + outBuffer.put(keyword.getBytes()); + if (isExpanding) { + outBuffer.put((byte) ':'); + outBuffer.put((byte) ' '); + outBuffer = expandKeywordValue(keyword, outBuffer); + outBuffer.put((byte) ' '); + } + outBuffer.put((byte) '$'); + } else { + outBuffer = append(outBuffer, unprocessedBuffer.toByteArray()); + } + // src part is consumed already, do nothing here, look for next possible kw + kwBreak = indexOf(src, '$', kwBreak+1, false); + } else { + // newline => tag without close + outBuffer = append(outBuffer, unprocessedBuffer.toByteArray()); + kwBreak = indexOf(src, '$', kwBreak+1, false); } + unprocessedBuffer.clear(); } + } while (kwBreak != -1); + if (outBuffer == null) { + return src; } - if (keywordStart != -1) { - if (rv == null) { - // no expansion happened yet, and we have potential kw start - rv = ByteBuffer.allocate(keywordStart - src.position()); - copySlice(src, src.position(), keywordStart, rv); + outBuffer = ensureCapacityFor(outBuffer, src.remaining()); + outBuffer.put(src); + outBuffer.flip(); + return outBuffer; + } + private boolean knownKeyword(String kw) { + return keywords.containsKey(kw); + } + + private static ByteBuffer append(ByteBuffer out, byte[] data) { + out = ensureCapacityFor(out, data.length); + out.put(data); + return out; + } + private static ByteBuffer append(ByteBuffer out, ByteBuffer in, int count) { + out = ensureCapacityFor(out, count); + while (count-- > 0) { + out.put(in.get()); + } + return out; + } + private static ByteBuffer ensureCapacityFor(ByteBuffer out, int exansion) { + if (out == null || out.remaining() < exansion) { + ByteBuffer newOut = ByteBuffer.allocate(out == null ? 
exansion*2 : out.capacity() + exansion); + if (out != null) { + out.flip(); + newOut.put(out); } - src.position(keywordStart); + return newOut; } - if (rv != null) { - rv.flip(); - return rv; - } - return src; + return out; } - /** - * @param keyword - * @param rv - */ - private void expandKeywordValue(String keyword, ByteBuffer rv) { + private ByteBuffer expandKeywordValue(String keyword, ByteBuffer rv) { + byte[] toInject; if ("Id".equals(keyword)) { - rv.put(identityString().getBytes()); + toInject = identityString().getBytes(); } else if ("Revision".equals(keyword)) { - rv.put(revision().getBytes()); + toInject = revision().getBytes(); } else if ("Author".equals(keyword)) { - rv.put(username().getBytes()); + toInject = username().getBytes(); } else if ("Date".equals(keyword)) { - rv.put(date().getBytes()); + toInject = date().getBytes(); } else { throw new IllegalStateException(String.format("Keyword %s is not yet supported", keyword)); } - } - - private String matchKeyword(ByteBuffer src, int kwStart, int kwEnd) { - assert kwEnd - kwStart - 1 > 0; - assert src.get(kwStart) == src.get(kwEnd) && src.get(kwEnd) == '$'; - char[] chars = new char[kwEnd - kwStart - 1]; - int i; - for (i = 0; i < chars.length; i++) { - char c = (char) src.get(kwStart + 1 + i); - if (c == ':') { - break; - } - chars[i] = c; - } - String kw = new String(chars, 0, i); -// XXX may use subMap to look up keywords based on few available characters (not waiting till closing $) -// System.out.println(keywords.subMap("I", "J")); -// System.out.println(keywords.subMap("A", "B")); -// System.out.println(keywords.subMap("Au", "B")); - return keywords.get(kw); + rv = ensureCapacityFor(rv, toInject.length); + rv.put(toInject); + return rv; } // copies part of the src buffer, [from..to). 
doesn't modify src position @@ -306,9 +286,22 @@ } public static class Factory implements Filter.Factory { - + private final Map<String,String> keywords; private HgRepository repo; private Path.Matcher matcher; + + public Factory() { + keywords = new TreeMap<String,String>(); + keywords.put("Id", "Id"); + keywords.put("Revision", "Revision"); + keywords.put("Author", "Author"); + keywords.put("Date", "Date"); + keywords.put("LastChangedRevision", "LastChangedRevision"); + keywords.put("LastChangedBy", "LastChangedBy"); + keywords.put("LastChangedDate", "LastChangedDate"); + keywords.put("Source", "Source"); + keywords.put("Header", "Header"); + } public void initialize(HgRepository hgRepo) { repo = hgRepo; @@ -324,7 +317,7 @@ public Filter create(Path path, Options opts) { if (matcher.accept(path)) { - return new KeywordFilter(repo, path, opts.getDirection() == Filter.Direction.FromRepo); + return new KeywordFilter(repo, path, keywords, opts.getDirection() == Filter.Direction.FromRepo); } return null; }