view src/org/tmatesoft/hg/internal/ChangesetParser.java @ 711:a62079bc422b

Keyword filtering that doesn't depend on the input buffer size or on how input lines get split between filter() calls. KeywordFilter got state to keep processed suspicious ...$ lines
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Fri, 11 Oct 2013 21:35:41 +0200
parents 545b1d4cc11d
children
/*
 * Copyright (c) 2010-2013 TMate Software Ltd
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * For information on how to redistribute this software under
 * the terms of a license other than GNU General Public License
 * contact TMate Software at support@hg4j.com
 */
package org.tmatesoft.hg.internal;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.tmatesoft.hg.core.Nodeid;
import org.tmatesoft.hg.core.SessionContext;
import org.tmatesoft.hg.repo.HgChangelog.RawChangeset;
import org.tmatesoft.hg.repo.HgInvalidDataFormatException;
import org.tmatesoft.hg.repo.HgRepository;

/**
 * @see mercurial/changelog.py:read()
 * 
 *      <pre>
 *         format used:
 *         nodeid\n        : manifest node in ascii
 *         user\n          : user, no \n or \r allowed
 *         time tz extra\n : date (time is int or float, timezone is int)
 *                         : extra is metadata, encoded and separated by '\0'
 *                         : older versions ignore it
 *         files\n\n       : files modified by the cset, no \n or \r allowed
 *         (.*)            : comment (free text, ideally utf-8)
 * 
 *         changelog v0 doesn't use extra
 * </pre>
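 * 
 *         A made-up sample entry, for illustration only (all values below are fictitious):
 * 
 *         <pre>
 *         f1e2d3c4b5a697887766554433221100ffeeddcc
 *         John Doe (john@example.com)
 *         1381519741 -7200 branch:stable
 *         src/org/example/First.java
 *         src/org/example/Second.java
 * 
 *         Sample commit comment, free text
 *         </pre>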
 * 
 * Extracted from the internals of HgChangelog (the code originally lived inside RawChangeset)
 * 
 * @author Artem Tikhomirov
 * @author TMate Software Ltd.
 */
public final class ChangesetParser {
	private final EncodingHelper encHelper;
	// it's likely user names get repeated again and again throughout the repository. 
	private final Pool<String> usersPool;
	private final Pool<String> filesPool;
	private final CsetFactory factory;
	
	public ChangesetParser(SessionContext.Source sessionContext, CsetFactory csetFactory) {
		assert csetFactory != null;
		encHelper = Internals.buildFileNameEncodingHelper(sessionContext);
		usersPool = new Pool<String>();
		filesPool = new Pool<String>();
		factory = csetFactory;
	}
	
	public void dispose() {
		usersPool.clear();
		filesPool.clear();
	}

	public RawChangeset parse(DataAccess da) throws IOException, HgInvalidDataFormatException {
		byte[] data = da.byteArray();
		return parse(data);
	}
	
	public RawChangeset parse(byte[] data) throws HgInvalidDataFormatException {
		return init(data, 0, data.length);
	}

	private RawChangeset init(byte[] data, int offset, int length) throws HgInvalidDataFormatException {
		final int bufferEndIndex = offset + length;
		final byte lineBreak = (byte) '\n';
		int breakIndex1 = indexOf(data, lineBreak, offset, bufferEndIndex);
		if (breakIndex1 == -1) {
			throw new HgInvalidDataFormatException("Bad Changeset data");
		}
		Nodeid _nodeid = Nodeid.fromAscii(data, offset, breakIndex1 - offset);
		int breakIndex2 = indexOf(data, lineBreak, breakIndex1 + 1, bufferEndIndex);
		if (breakIndex2 == -1) {
			throw new HgInvalidDataFormatException("Bad Changeset data");
		}
		String _user = encHelper.userFromChangeset(data, breakIndex1 + 1, breakIndex2 - breakIndex1 - 1);
		_user = usersPool.unify(_user);

		int breakIndex3 = indexOf(data, lineBreak, breakIndex2 + 1, bufferEndIndex);
		if (breakIndex3 == -1) {
			throw new HgInvalidDataFormatException("Bad Changeset data");
		}
		String _timeString = new String(data, breakIndex2 + 1, breakIndex3 - breakIndex2 - 1);
		int space1 = _timeString.indexOf(' ');
		if (space1 == -1) {
			throw new HgInvalidDataFormatException(String.format("Bad Changeset data: %s in [%d..%d]", "time string", breakIndex2+1, breakIndex3));
		}
		int space2 = _timeString.indexOf(' ', space1 + 1);
		if (space2 == -1) {
			space2 = _timeString.length();
		}
		long unixTime = Long.parseLong(_timeString.substring(0, space1));
		int _timezone = Integer.parseInt(_timeString.substring(space1 + 1, space2));
		// unixTime is seconds since epoch in UTC; timezone records the committer's local time offset from UTC, in seconds.
		Date _time = new Date(unixTime * 1000);
		String _extras = space2 < _timeString.length() ? _timeString.substring(space2 + 1) : null;
		Map<String, String> _extrasMap = parseExtras(_extras);
		//
		int lastStart = breakIndex3 + 1;
		int breakIndex4 = indexOf(data, lineBreak, lastStart, bufferEndIndex);
		ArrayList<String> _files = null;
		if (breakIndex4 > lastStart) {
			// if breakIndex4 == lastStart, we already found \n\n and hence there are no files (e.g. merge revision)
			_files = new ArrayList<String>(5);
			while (breakIndex4 != -1 && breakIndex4 + 1 < bufferEndIndex) {
				String fname = encHelper.fileFromChangeset(data, lastStart, breakIndex4 - lastStart);
				_files.add(filesPool.unify(fname));
				lastStart = breakIndex4 + 1;
				if (data[breakIndex4 + 1] == lineBreak) {
					// found \n\n
					break;
				} else {
					breakIndex4 = indexOf(data, lineBreak, lastStart, bufferEndIndex);
				}
			}
			if (breakIndex4 == -1 || breakIndex4 >= bufferEndIndex) {
				throw new HgInvalidDataFormatException("Bad Changeset data");
			}
		} else if (breakIndex4 == -1) {
			// no \n\n found at all: malformed entry
			throw new HgInvalidDataFormatException("Bad Changeset data");
		} else {
			breakIndex4--;
		}
		String _comment = encHelper.commentFromChangeset(data, breakIndex4 + 2, bufferEndIndex - breakIndex4 - 2);
		RawChangeset target = factory.create(_nodeid, _user, _time, _timezone, _files, _comment, _extrasMap);
		return target; 
	}

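	// Extras are '\0'-separated, backslash-escaped key:value pairs (see decode()); the "branch" key
	// is always present in the returned map, defaulting to HgRepository.DEFAULT_BRANCH_NAME.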
	private Map<String, String> parseExtras(String _extras) {
		final String extras_branch_key = "branch";
		_extras = _extras == null ? null : _extras.trim();
		if (_extras == null || _extras.length() == 0) {
			return Collections.singletonMap(extras_branch_key, HgRepository.DEFAULT_BRANCH_NAME);
		}
		Map<String, String> _extrasMap = new HashMap<String, String>();
		int lastIndex = 0;
		do {
			String pair;
			int sp = _extras.indexOf('\0', lastIndex);
			if (sp == -1) {
				sp = _extras.length();
			}
			if (sp > lastIndex) {
				pair = _extras.substring(lastIndex, sp);
				pair = decode(pair);
				int eq = pair.indexOf(':');
				_extrasMap.put(pair.substring(0, eq), pair.substring(eq + 1));
			}
			// always advance past the '\0' separator, even when the pair is empty, so the loop can't stall
			lastIndex = sp + 1;
		} while (lastIndex < _extras.length());
		if (!_extrasMap.containsKey(extras_branch_key)) {
			_extrasMap.put(extras_branch_key, HgRepository.DEFAULT_BRANCH_NAME);
		}
		return Collections.unmodifiableMap(_extrasMap);
	}

	private static int indexOf(byte[] src, byte what, int startOffset, int endIndex) {
		for (int i = startOffset; i < endIndex; i++) {
			if (src[i] == what) {
				return i;
			}
		}
		return -1;
	}
	
	private static String decode(String s) {
		if (s != null && s.indexOf('\\') != -1) {
			// TestAuxUtilities#testChangelogExtrasDecode
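			// undo the backslash escaping used for extras values: \\, \n, \r and \0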
			return s.replace("\\\\", "\\").replace("\\n", "\n").replace("\\r", "\r").replace("\\0", "\00");
		}
		return s;
	}

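	/**
	 * Callback that builds a {@link RawChangeset} from the fields parsed out of a raw changelog entry.
	 */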
	public interface CsetFactory {
		public RawChangeset create(Nodeid nodeid, String user, Date time, int timezone, List<String> files, String comment, Map<String, String> extrasMap);
	}
}