view src/org/tmatesoft/hg/internal/StoragePathHelper.java @ 411:464b4404e75d smartgit3

Issue 29: Bad storage path translation - translate Unicode chars to filesystem encoding
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Tue, 20 Mar 2012 17:56:50 +0100
parents 6d2c6b2469fc
children 528b6780a8bd
line wrap: on
line source
/*
 * Copyright (c) 2011-2012 TMate Software Ltd
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * For information on how to redistribute this software under
 * the terms of a license other than GNU General Public License
 * contact TMate Software at support@hg4j.com
 */
package org.tmatesoft.hg.internal;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.Arrays;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.tmatesoft.hg.util.PathRewrite;

/**
 * Rewrites a repository file name into the name of its revlog index file (<code>.i</code>) 
 * relative to the <code>.hg</code> directory, applying the escaping selected by the 
 * <code>store</code>, <code>fncache</code> and <code>dotencode</code> flags.
 * 
 * <p>Instances are NOT thread-safe: a single {@link CharsetEncoder}, its buffers and the scratch array
 * returned by {@link #toHexByte(int)} are shared between calls. Synchronize externally if an instance 
 * is used from several threads.
 * 
 * @see <a href="http://mercurial.selenic.com/wiki/CaseFoldingPlan">CaseFoldingPlan</a>
 * @see <a href="http://mercurial.selenic.com/wiki/fncacheRepoFormat">fncacheRepoFormat</a>
 * @see <a href="http://mercurial.selenic.com/wiki/EncodingStrategy">EncodingStrategy</a>
 * 
 * @author Artem Tikhomirov
 * @author TMate Software Ltd.
 */
class StoragePathHelper implements PathRewrite {
	
	private static final String STR_STORE = "store/";
	private static final String STR_DATA = "data/";
	private static final String STR_DH = "dh/";
	private static final String INDEX_EXT = ".i";
	// characters that are always replaced with ~XX
	private static final String RESERVED_CHARS = "\\:*?\"<>|";
	private static final String HEX_DIGITS = "0123456789abcdef";
	// longest name (including "data/" prefix and ".i" extension) that is kept unhashed
	private static final int MAX_PATH_LEN = 120;
	// length of hex representation of sha1 digest
	private static final int DIGEST_HEX_LEN = 40;
	// number of characters of each directory name kept in a hashed path
	private static final int DIR_PREFIX_LEN = 8;
	// not sure why (-4) is here. 120 - 40 = up to 80 for path with ext. dh/ + ext(.i) = 3+2
	private static final int MAX_DIR_PREFIX = 8 * (DIR_PREFIX_LEN + 1) - 4;
	// directory names that end with .i, .d or .hg get extra .hg suffix
	private static final Pattern SUFFIX_TO_REPLACE = Pattern.compile("\\.([id]|hg)/");
	private static final TreeSet<String> WINDOWS_RESERVED_FILENAMES = new TreeSet<String>(Arrays.asList(
			"con prn aux nul com1 com2 com3 com4 com5 com6 com7 com8 com9 lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9".split(" ")));

	private final boolean store;
	private final boolean fncache;
	private final boolean dotencode;
	private final CharsetEncoder csEncoder;
	// scratch buffer shared by all toHexByte() calls, always starts with '~'
	private final char[] hexEncodedByte = new char[] {'~', '0', '0'};
	private final ByteBuffer byteEncodingBuf;
	private final CharBuffer charEncodingBuf;
	
	/**
	 * Same as {@link #StoragePathHelper(boolean, boolean, boolean, Charset)} with the platform default charset.
	 */
	public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode) {
		this(isStore, isFncache, isDotencode, Charset.defaultCharset());
	}

	/**
	 * @param isStore <code>true</code> to apply filename escaping and prefix result with <code>store/data/</code>
	 * @param isFncache <code>true</code> to additionally escape Windows device names and to hash names that are too long
	 * @param isDotencode <code>true</code> to escape dot or space at the start of a path segment (effective with fncache only)
	 * @param fsEncoding charset used to translate characters above 0xFF into bytes, shall not be <code>null</code>
	 */
	public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode, Charset fsEncoding) {
		assert fsEncoding != null;
		store = isStore;
		fncache = isFncache;
		dotencode = isDotencode;
		csEncoder = fsEncoding.newEncoder(); // FIXME catch exception and rethrow as our's RT
		byteEncodingBuf = ByteBuffer.allocate(Math.round(csEncoder.maxBytesPerChar()) + 1/*in fact, need ceil, hence +1*/);
		charEncodingBuf = CharBuffer.allocate(1);
	}

	// FIXME document what path argument is, whether it includes .i or .d, and whether it's 'normalized' (slashes) or not.
	// since .hg/store keeps both .i files and files without extension (e.g. fncache), guess, for data == false 
	// we shall assume path has extension
	/**
	 * Translates a file name into the name of its index file.
	 * 
	 * @param p file name, with <code>/</code> as segment separator and without <code>.i</code> extension
	 * @return translated name with <code>.i</code> appended, as {@link String}
	 */
	public CharSequence rewrite(CharSequence p) {
		final CharSequence path = escapeReservedSuffixes(p);
		StringBuilder sb = new StringBuilder(path.length() << 1);
		// NOTE(review): when neither store nor fncache is set, nothing is appended here and the result
		// is just ".i" - looks unintended for repositories without 'store' requirement, confirm with callers
		if (store || fncache) {
			encodePath(path, sb, true);
			// auxencode
			if (fncache) {
				encodeWindowsDeviceNames(sb);
			}
		}
		if (fncache && (sb.length() + STR_DATA.length() + INDEX_EXT.length() > MAX_PATH_LEN)) {
			sb = hashedName(path);
		} else if (store) {
			sb.insert(0, STR_STORE + STR_DATA);
		}
		sb.append(INDEX_EXT);
		return sb.toString();
	}

	/**
	 * Appends <code>.hg</code> to every directory name that ends with <code>.i</code>, <code>.d</code> or <code>.hg</code>.
	 * 
	 * @return the argument itself if there was nothing to replace
	 */
	private static CharSequence escapeReservedSuffixes(CharSequence p) {
		Matcher suffixMatcher = SUFFIX_TO_REPLACE.matcher(p);
		if (!suffixMatcher.find()) {
			return p;
		}
		// Matcher.replaceAll, but without extra toString
		StringBuffer sb = new StringBuffer(p.length() + 20);
		do {
			suffixMatcher.appendReplacement(sb, ".$1.hg/");
		} while (suffixMatcher.find());
		suffixMatcher.appendTail(sb);
		return sb;
	}

	/**
	 * Escapes characters of the path one by one and appends the outcome to <code>sb</code>.
	 * 
	 * @param caseFolding <code>true</code> to keep case information (<code>A</code> becomes <code>_a</code>,
	 * <code>_</code> becomes <code>__</code>), <code>false</code> to merely lowercase ASCII capitals 
	 */
	private void encodePath(CharSequence path, StringBuilder sb, boolean caseFolding) {
		for (int i = 0, len = path.length(); i < len; i++) {
			final char ch = path.charAt(i);
			if (ch >= 'a' && ch <= 'z') {
				sb.append(ch); // POIRAE
			} else if (ch >= 'A' && ch <= 'Z') {
				if (caseFolding) {
					sb.append('_');
				}
				sb.append((char) (ch | 0x20)); // lowercase
			} else if (RESERVED_CHARS.indexOf(ch) != -1 || (ch >= '~' /*126*/ && ch <= 255) || ch < ' ' /*32*/) {
				// NOTE(review): chars 0x7F..0xFF are written as their code point, not as bytes 
				// in filesystem encoding - confirm it's intended
				sb.append(toHexByte(ch));
			} else if (caseFolding && ch == '_') {
				sb.append('_');
				sb.append('_');
			} else if (ch < '~' /*126*/ || !csEncoder.canEncode(ch)) {
				// ASCII char that doesn't require special handling, or a character we can't translate
				sb.append(ch);
			} else {
				// Unicode character to get encoded according to filesystem/native encoding, 
				// see http://mercurial.selenic.com/wiki/EncodingStrategy
				// despite of what the page says, use of native encoding seems worst solution to me (repositories
				// can't be easily shared between OS'es with different encodings then, e.g. Win1251 and Linux UTF8).
				// If the ease of sharing was not the point, what's the reason to mangle with names at all then (
				// lowercase and exclude reserved device names).
				appendEncoded(sb, ch);
			}
		}
	}

	/**
	 * Builds <code>store/dh/</code> name for a path that is too long to be kept as is: shortened directory names,
	 * as much of the file name as fits, and sha1 digest of the original name. Extension is not appended.
	 */
	private StringBuilder hashedName(CharSequence path) {
		// TODO [post-1.0] Mercurial uses system encoding for paths, hence we need to pass bytes to DigestHelper
		// to ensure our sha1 value (default encoding of unicode string if one looks into DH impl) match that 
		// produced by Mercurial (based on native string). 
		String digest = new DigestHelper().sha1(STR_DATA, path, INDEX_EXT).asHexString();
		StringBuilder lowered = new StringBuilder(MAX_PATH_LEN);
		encodePath(path, lowered, false);
		encodeWindowsDeviceNames(lowered);
		// since we rewrite file names, it never ends with slash (for dirs, I'd pass length-2);
		final int lastSlash = lowered.lastIndexOf("/");
		StringBuilder shortName = new StringBuilder(MAX_PATH_LEN);
		if (lastSlash != -1) {
			appendShortenedDirs(lowered, lastSlash, shortName);
		} // else no dirs, just long filename
		int left = MAX_PATH_LEN - shortName.length() - DIGEST_HEX_LEN - STR_DH.length() - INDEX_EXT.length();
		assert left >= 0;
		final int nameStart = lastSlash + 1; // move from / to actual name, 0 if there are no dirs
		if (nameStart + left > lowered.length()) {
			// the mangled name is shorter than the room we've got, take it whole and fill 
			// the rest with as much of the extension as fits. Never pad: StringBuilder.setLength() 
			// with a value greater than actual length would add \u0000 chars to the name.
			shortName.append(lowered, nameStart, lowered.length());
			int stillAvailable = (nameStart + left) - lowered.length();
			// stillAvailable > 0;
			shortName.append(INDEX_EXT, 0, Math.min(stillAvailable, INDEX_EXT.length()));
		} else {
			// add as much as we can
			shortName.append(lowered, nameStart, nameStart + left);
		}
		StringBuilder completeHashName = new StringBuilder(MAX_PATH_LEN);
		completeHashName.append(STR_STORE);
		completeHashName.append(STR_DH);
		completeHashName.append(shortName);
		completeHashName.append(digest);
		return completeHashName;
	}

	/**
	 * Appends directory segments of <code>encoded</code>, each cut to {@link #DIR_PREFIX_LEN} characters and 
	 * followed by slash, until the next one would make the outcome longer than {@link #MAX_DIR_PREFIX}.
	 * 
	 * @param encoded complete escaped path
	 * @param fnameStart index of the last slash in <code>encoded</code>, not negative
	 * @param out receives shortened directories
	 */
	private static void appendShortenedDirs(StringBuilder encoded, int fnameStart, StringBuilder out) {
		int x = 0; // segment start
		do {
			int i = encoded.indexOf("/", x);
			final int lenBefore = out.length();
			if (i - x <= DIR_PREFIX_LEN) { // a b c d e f g h /
				out.append(encoded, x, i + 1); // with slash
			} else {
				out.append(encoded, x, x + DIR_PREFIX_LEN);
				// may unexpectedly end with bad character
				final int last = out.length() - 1;
				char lastChar = out.charAt(last);
				assert lastChar == encoded.charAt(x + DIR_PREFIX_LEN - 1);
				if (lastChar == '.' || lastChar == ' ') {
					out.setCharAt(last, '_');
				}
				out.append('/');
			}
			if (out.length() - 1 > MAX_DIR_PREFIX) {
				out.setLength(lenBefore); // strip off last segment, it's too much
				break;
			}
			x = i + 1;
		} while (x < fnameStart);
		assert out.length() > 0 && out.charAt(out.length() - 1) == '/';
	}
	
	/**
	 * In-place escaping of path segments: the third letter of a Windows device name (segment equal to the name, 
	 * or starting with the name followed by a dot) is replaced with ~XX and, if <code>dotencode</code> is set, 
	 * so is a dot or space at the start of a segment.
	 */
	// NOTE(review): Mercurial's auxencode seems to escape trailing dot/space of a segment as well, 
	// which is not done here - confirm against Mercurial's store.py
	private void encodeWindowsDeviceNames(StringBuilder sb) {
		if (sb.length() == 0) {
			// nothing to encode, and charAt(0) below would fail
			return;
		}
		int x = 0; // last segment start
		do {
			int i = sb.indexOf("/", x);
			if (i == -1) {
				i = sb.length();
			}
			// windows reserved filenames are at least of length 3 
			if (i - x >= 3) {
				boolean found = false;
				if (i-x == 3 || i-x == 4) {
					found = WINDOWS_RESERVED_FILENAMES.contains(sb.substring(x, i));
				} else if (sb.charAt(x+3) == '.') { // implicit i-x > 3
					found = WINDOWS_RESERVED_FILENAMES.contains(sb.substring(x, x+3));
				} else if (i-x > 4 && sb.charAt(x+4) == '.') {
					found = WINDOWS_RESERVED_FILENAMES.contains(sb.substring(x, x+4));
				}
				if (found) {
					// x+2 as we change the third letter in device name
					replace(sb, x+2, toHexByte(sb.charAt(x+2)));
					i += 2; // one char became three, segment end moved
				}
			}
			if (dotencode && (sb.charAt(x) == '.' || sb.charAt(x) == ' ')) {
				char dotOrSpace = sb.charAt(x); // beware, replace() below changes charAt(x), rather get a copy 
				// not to get ~7e for '.' instead of ~2e, if later refactoring changes the logic 
				replace(sb, x, toHexByte(dotOrSpace));
				i += 2;
			}
			x = i+1;
		} while (x < sb.length());
	}
	
	/**
	 * Appends ~XX sequence for each byte the character translates to in the filesystem encoding.
	 * Callers are expected to check {@link CharsetEncoder#canEncode(char)} first, outcome of the encode
	 * operation is not analyzed here.
	 */
	// shall be synchronized in case of multithreaded use
	private void appendEncoded(StringBuilder sb, char ch) {
		charEncodingBuf.clear();
		byteEncodingBuf.clear();
		charEncodingBuf.put(ch).flip();
		// each character is a complete input of its own
		csEncoder.encode(charEncodingBuf, byteEncodingBuf, true);
		// bring encoder back to initial state: canEncode() is documented to fail with IllegalStateException 
		// when an encoding operation is in progress, and outcome shall not depend on characters encoded before 
		csEncoder.reset();
		byteEncodingBuf.flip();
		while (byteEncodingBuf.hasRemaining()) {
			sb.append(toHexByte(byteEncodingBuf.get()));
		}
	}

	/**
	 * replace char at sb[index] with a sequence
	 */
	private static void replace(StringBuilder sb, int index, char[] with) {
		// there's StringBuilder.replace(int, int+1, String), but with char[] - I don't want to make a string out of hexEncodedByte
		sb.setCharAt(index, with[0]);
		sb.insert(index+1, with, 1, with.length - 1);
	}

	/**
	 * Formats the low 8 bits of the argument as <code>~XX</code> (lowercase hex).
	 * 
	 * @return shared scratch array, valid until next invocation of this method; use or copy right away 
	 */
	private char[] toHexByte(int ch) {
		hexEncodedByte[1] = HEX_DIGITS.charAt((ch & 0x00F0) >>> 4);
		hexEncodedByte[2] = HEX_DIGITS.charAt(ch & 0x0F);
		return hexEncodedByte;
	}
}