changeset 411:464b4404e75d smartgit3

Issue 29: Bad storage path translation - translate Unicode chars to filesystem encoding
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Tue, 20 Mar 2012 17:56:50 +0100 (2012-03-20)
parents df5009d67be2
children 63c5a9d7ca3f
files src/org/tmatesoft/hg/internal/Internals.java src/org/tmatesoft/hg/internal/StoragePathHelper.java test/org/tmatesoft/hg/test/TestStorePath.java
diffstat 3 files changed, 171 insertions(+), 32 deletions(-) [+]
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/internal/Internals.java	Fri Mar 16 20:19:51 2012 +0100
+++ b/src/org/tmatesoft/hg/internal/Internals.java	Tue Mar 20 17:56:50 2012 +0100
@@ -21,6 +21,7 @@
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -56,13 +57,30 @@
 	 */
 	public static final String CFG_PROPERTY_REVLOG_STREAM_CACHE = "hg4j.repo.disable_revlog_cache";
 	
+	/**
+	 * Name of charset to use when translating Unicode filenames to Mercurial storage paths, string, 
+	 * to resolve with {@link Charset#forName(String)}.
+	 * E.g. <code>"cp1251"</code> or <code>"Latin-1"</code>.
+	 * 
+	 * <p>Mercurial uses system encoding when mangling storage paths. Default value
+	 * based on 'file.encoding' Java system property is usually fine here, however
+	 * in certain scenarios it may be desirable to force a different one, and this 
+	 * property is exactly for this purpose.
+	 * 
+	 * <p>E.g. Eclipse defaults to project encoding (Launch config, Common page) when launching an application, 
+	 * and if your project happen to use anything but filesystem default (say, UTF8 on cp1251 system),
+	 * native storage paths won't match
+	 */
+	public static final String CFG_PROPERT_FS_FILENAME_ENCODING = "hg.fs.filename.encoding";
+	
 	private int requiresFlags = 0;
 	private List<Filter.Factory> filterFactories;
+	private final SessionContext sessionContext;
 	private final boolean isCaseSensitiveFileSystem;
 	private final boolean shallCacheRevlogsInRepo;
-	
 
 	public Internals(SessionContext ctx) {
+		this.sessionContext = ctx;
 		isCaseSensitiveFileSystem = !runningOnWindows();
 		Object p = ctx.getProperty(CFG_PROPERTY_REVLOG_STREAM_CACHE, true);
 		shallCacheRevlogsInRepo = p instanceof Boolean ? ((Boolean) p).booleanValue() : Boolean.parseBoolean(String.valueOf(p));
@@ -91,7 +109,21 @@
 
 	// XXX perhaps, should keep both fields right here, not in the HgRepository
 	public PathRewrite buildDataFilesHelper() {
-		return new StoragePathHelper((requiresFlags & STORE) != 0, (requiresFlags & FNCACHE) != 0, (requiresFlags & DOTENCODE) != 0);
+		Object altEncoding = sessionContext.getProperty(CFG_PROPERT_FS_FILENAME_ENCODING, null);
+		Charset cs;
+		if (altEncoding == null) {
+			cs = Charset.defaultCharset();
+		} else {
+			try {
+				cs = Charset.forName(altEncoding.toString());
+			} catch (IllegalArgumentException ex) {
+				// both IllegalCharsetNameException and UnsupportedCharsetException are subclasses of IAE, too
+				// not severe enough to throw an exception, imo. Just record the fact it's bad ad we ignore it 
+				sessionContext.getLog().error(getClass(), ex, String.format("Bad configuration value for filename encoding %s", altEncoding));
+				cs = Charset.defaultCharset();
+			}
+		}
+		return new StoragePathHelper((requiresFlags & STORE) != 0, (requiresFlags & FNCACHE) != 0, (requiresFlags & DOTENCODE) != 0, cs);
 	}
 
 	public PathRewrite buildRepositoryFilesHelper() {
--- a/src/org/tmatesoft/hg/internal/StoragePathHelper.java	Fri Mar 16 20:19:51 2012 +0100
+++ b/src/org/tmatesoft/hg/internal/StoragePathHelper.java	Tue Mar 20 17:56:50 2012 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011 TMate Software Ltd
+ * Copyright (c) 2011-2012 TMate Software Ltd
  *  
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -16,14 +16,21 @@
  */
 package org.tmatesoft.hg.internal;
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
 import java.util.Arrays;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.tmatesoft.hg.util.PathRewrite;
 
 /**
  * @see http://mercurial.selenic.com/wiki/CaseFoldingPlan
  * @see http://mercurial.selenic.com/wiki/fncacheRepoFormat
+ * @see http://mercurial.selenic.com/wiki/EncodingStrategy
  * 
  * @author Artem Tikhomirov
  * @author TMate Software Ltd.
@@ -33,11 +40,25 @@
 	private final boolean store;
 	private final boolean fncache;
 	private final boolean dotencode;
+	private final Pattern suffix2replace;
+	private final CharsetEncoder csEncoder;
+	private final char[] hexEncodedByte = new char[] {'~', '0', '0'};
+	private final ByteBuffer byteEncodingBuf;
+	private final CharBuffer charEncodingBuf;
+	
+	public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode) {
+		this(isStore, isFncache, isDotencode, Charset.defaultCharset());
+	}
 
-	public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode) {
+	public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode, Charset fsEncoding) {
+		assert fsEncoding != null;
 		store = isStore;
 		fncache = isFncache;
 		dotencode = isDotencode;
+		suffix2replace = Pattern.compile("\\.([id]|hg)/");
+		csEncoder = fsEncoding.newEncoder(); // FIXME catch exception and rethrow as our's RT
+		byteEncodingBuf = ByteBuffer.allocate(Math.round(csEncoder.maxBytesPerChar()) + 1/*in fact, need ceil, hence +1*/);
+		charEncodingBuf = CharBuffer.allocate(1);
 	}
 
 	// FIXME document what path argument is, whether it includes .i or .d, and whether it's 'normalized' (slashes) or not.
@@ -48,13 +69,24 @@
 		final String STR_DATA = "data/";
 		final String STR_DH = "dh/";
 		final String reservedChars = "\\:*?\"<>|";
-		char[] hexByte = new char[2];
 		
-		String path = p.toString();
-		path = path.replace(".hg/", ".hg.hg/").replace(".i/", ".i.hg/").replace(".d/", ".d.hg/");
+		Matcher suffixMatcher = suffix2replace.matcher(p);
+		CharSequence path;
+		// Matcher.replaceAll, but without extra toString
+		boolean found = suffixMatcher.find();
+		if (found) {
+			StringBuffer sb = new StringBuffer(p.length()  + 20);
+			do {
+				suffixMatcher.appendReplacement(sb, ".$1.hg/");
+			} while (found = suffixMatcher.find());
+			suffixMatcher.appendTail(sb);
+			path = sb;
+		} else {
+			path = p;
+		}
+		
 		StringBuilder sb = new StringBuilder(path.length() << 1);
 		if (store || fncache) {
-			// encodefilename
 			for (int i = 0; i < path.length(); i++) {
 				final char ch = path.charAt(i);
 				if (ch >= 'a' && ch <= 'z') {
@@ -63,16 +95,24 @@
 					sb.append('_');
 					sb.append(Character.toLowerCase(ch)); // Perhaps, (char) (((int) ch) + 32)? Even better, |= 0x20? 
 				} else if (reservedChars.indexOf(ch) != -1) {
-					sb.append('~');
-					sb.append(toHexByte(ch, hexByte));
+					sb.append(toHexByte(ch));
 				} else if ((ch >= '~' /*126*/ && ch <= 255) || ch < ' ' /*32*/) {
-					sb.append('~');
-					sb.append(toHexByte(ch, hexByte));
+					sb.append(toHexByte(ch));
 				} else if (ch == '_') {
 					sb.append('_');
 					sb.append('_');
 				} else {
-					sb.append(ch);
+					// either ASCII char that doesn't require special handling, or an Unicode character to get encoded
+					// according to filesystem/native encoding, see http://mercurial.selenic.com/wiki/EncodingStrategy
+					// despite of what the page says, use of native encoding seems worst solution to me (repositories
+					// can't be easily shared between OS'es with different encodings then, e.g. Win1251 and Linux UTF8).
+					// If the ease of sharing was not the point, what's the reason to mangle with names at all then (
+					// lowercase and exclude reserved device names).
+					if (ch < '~' /*126*/ || !csEncoder.canEncode(ch)) {
+						sb.append(ch);
+					} else {
+						appendEncoded(sb, ch);
+					}
 				}
 			}
 			// auxencode
@@ -82,6 +122,9 @@
 		}
 		final int MAX_PATH_LEN = 120;
 		if (fncache && (sb.length() + STR_DATA.length() + ".i".length() > MAX_PATH_LEN)) {
+			// TODO [post-1.0] Mercurial uses system encoding for paths, hence we need to pass bytes to DigestHelper
+			// to ensure our sha1 value (default encoding of unicode string if one looks into DH impl) match that 
+			// produced by Mercurial (based on native string). 
 			String digest = new DigestHelper().sha1(STR_DATA, path, ".i").asHexString();
 			final int DIR_PREFIX_LEN = 8;
 			 // not sure why (-4) is here. 120 - 40 = up to 80 for path with ext. dh/ + ext(.i) = 3+2
@@ -94,13 +137,15 @@
 				} else if (ch >= 'A' && ch <= 'Z') {
 					sb.append((char) (ch | 0x20)); // lowercase 
 				} else if (reservedChars.indexOf(ch) != -1) {
-					sb.append('~');
-					sb.append(toHexByte(ch, hexByte));
+					sb.append(toHexByte(ch));
 				} else if ((ch >= '~' /*126*/ && ch <= 255) || ch < ' ' /*32*/) {
-					sb.append('~');
-					sb.append(toHexByte(ch, hexByte));
+					sb.append(toHexByte(ch));
 				} else {
-					sb.append(ch);
+					if (ch < '~' /*126*/ || !csEncoder.canEncode(ch)) {
+						sb.append(ch);
+					} else {
+						appendEncoded(sb, ch);
+					}
 				}
 			}
 			encodeWindowsDeviceNames(sb);
@@ -163,7 +208,6 @@
 	}
 	
 	private void encodeWindowsDeviceNames(StringBuilder sb) {
-		char[] hexByte = new char[2];
 		int x = 0; // last segment start
 		final TreeSet<String> windowsReservedFilenames = new TreeSet<String>();
 		windowsReservedFilenames.addAll(Arrays.asList("con prn aux nul com1 com2 com3 com4 com5 com6 com7 com8 com9 lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9".split(" "))); 
@@ -183,25 +227,49 @@
 					found = windowsReservedFilenames.contains(sb.subSequence(x, x+4));
 				}
 				if (found) {
-					sb.insert(x+3, toHexByte(sb.charAt(x+2), hexByte));
-					sb.setCharAt(x+2, '~');
+					// x+2 as we change the third letter in device name
+					replace(sb, x+2, toHexByte(sb.charAt(x+2)));
 					i += 2;
 				}
 			}
 			if (dotencode && (sb.charAt(x) == '.' || sb.charAt(x) == ' ')) {
-				sb.insert(x+1, toHexByte(sb.charAt(x), hexByte));
-				sb.setCharAt(x, '~'); // setChar *after* charAt/insert to get ~2e, not ~7e for '.'
+				char dotOrSpace = sb.charAt(x); // beware, replace() below changes charAt(x), rather get a copy 
+				// not to get ~7e for '.' instead of ~2e, if later refactoring changes the logic 
+				replace(sb, x, toHexByte(dotOrSpace));
 				i += 2;
 			}
 			x = i+1;
 		} while (x < sb.length());
 	}
+	
+	// shall be synchronized in case of multithreaded use
+	private void appendEncoded(StringBuilder sb, char ch) {
+		charEncodingBuf.clear();
+		byteEncodingBuf.clear();
+		charEncodingBuf.put(ch).flip();
+		csEncoder.encode(charEncodingBuf, byteEncodingBuf, false);
+		byteEncodingBuf.flip();
+		while (byteEncodingBuf.hasRemaining()) {
+			sb.append(toHexByte(byteEncodingBuf.get()));
+		}
+	}
 
-	private static char[] toHexByte(int ch, char[] buf) {
-		assert buf.length > 1;
+	/**
+	 * replace char at sb[index] with a sequence
+	 */
+	private static void replace(StringBuilder sb, int index, char[] with) {
+		// there's StringBuilder.replace(int, int+1, String), but with char[] - I don't want to make a string out of hexEncodedByte
+		sb.setCharAt(index, with[0]);
+		sb.insert(index+1, with, 1, with.length - 1);
+	}
+
+	/**
+	 * put hex representation of byte ch into buf from specified offset 
+	 */
+	private char[] toHexByte(int ch) {
 		final String hexDigits = "0123456789abcdef";
-		buf[0] = hexDigits.charAt((ch & 0x00F0) >>> 4);
-		buf[1] = hexDigits.charAt(ch & 0x0F);
-		return buf;
+		hexEncodedByte[1] = hexDigits.charAt((ch & 0x00F0) >>> 4);
+		hexEncodedByte[2] = hexDigits.charAt(ch & 0x0F);
+		return hexEncodedByte;
 	}
 }
--- a/test/org/tmatesoft/hg/test/TestStorePath.java	Fri Mar 16 20:19:51 2012 +0100
+++ b/test/org/tmatesoft/hg/test/TestStorePath.java	Tue Mar 20 17:56:50 2012 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011 TMate Software Ltd
+ * Copyright (c) 2011-2012 TMate Software Ltd
  *  
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -16,9 +16,13 @@
  */
 package org.tmatesoft.hg.test;
 
+import java.util.HashMap;
+import java.util.Map;
+
 import junit.framework.Assert;
 
 import org.hamcrest.CoreMatchers;
+import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
 import org.tmatesoft.hg.internal.BasicSessionContext;
@@ -36,18 +40,30 @@
 	public ErrorCollectorExt errorCollector = new ErrorCollectorExt();
 	
 	private PathRewrite storePathHelper;
+	private final Map<String, Object> propertyOverrides = new HashMap<String, Object>();
+
+	private Internals internals;
 
 	public static void main(String[] args) throws Throwable {
 		final TestStorePath test = new TestStorePath();
 		test.testWindowsFilenames();
 		test.testHashLongPath();
+		test.testSuffixReplacement();
 		test.errorCollector.verify();
 	}
 	
 	public TestStorePath() {
-		final Internals i = new Internals(new BasicSessionContext(null, null));
-		i.setStorageConfig(1, 0x7);
-		storePathHelper = i.buildDataFilesHelper();
+		propertyOverrides.put("hg.consolelog.debug", true);
+		internals = new Internals(new BasicSessionContext(propertyOverrides, null, null));
+		internals.setStorageConfig(1, 0x7);
+		storePathHelper = internals.buildDataFilesHelper();
+	}
+	
+	@Before
+	public void setup() {
+		// just in case there are leftovers from #testNationalChars() and another test builds a helper
+		propertyOverrides.clear();
+		propertyOverrides.put("hg.consolelog.debug", true);
 	}
 
 	@Test
@@ -88,4 +104,27 @@
 		errorCollector.checkThat(storePathHelper.rewrite(n2), CoreMatchers.<CharSequence>equalTo(r2));
 		errorCollector.checkThat(storePathHelper.rewrite(n3), CoreMatchers.<CharSequence>equalTo(r3));
 	}
+
+	@Test
+	public void testSuffixReplacement() {
+		String s1 = "aaa/bbb.hg/ccc.i/ddd.hg/xx.i";
+		String s2 = "bbb.d/aa.hg/bbb.hg/yy.d";
+		String r1 = s1.replace(".hg/", ".hg.hg/").replace(".i/", ".i.hg/").replace(".d/", ".d.hg/");
+		String r2 = s2.replace(".hg/", ".hg.hg/").replace(".i/", ".i.hg/").replace(".d/", ".d.hg/");
+		errorCollector.checkThat(storePathHelper.rewrite(s1), CoreMatchers.<CharSequence>equalTo("store/data/" + r1 + ".i"));
+		errorCollector.checkThat(storePathHelper.rewrite(s2), CoreMatchers.<CharSequence>equalTo("store/data/" + r2 + ".i"));
+	}
+	
+	@Test
+	public void testNationalChars() {
+		String s = "Привет.txt";
+		//
+		propertyOverrides.put(Internals.CFG_PROPERT_FS_FILENAME_ENCODING, "cp1251");
+		PathRewrite sph = internals.buildDataFilesHelper();
+		errorCollector.checkThat(sph.rewrite(s), CoreMatchers.<CharSequence>equalTo("store/data/~cf~f0~e8~e2~e5~f2.txt.i"));
+		//
+		propertyOverrides.put(Internals.CFG_PROPERT_FS_FILENAME_ENCODING, "UTF8");
+		sph = internals.buildDataFilesHelper();
+		errorCollector.checkThat(sph.rewrite(s), CoreMatchers.<CharSequence>equalTo("store/data/~d0~9f~d1~80~d0~b8~d0~b2~d0~b5~d1~82.txt.i"));
+	}
 }