diff src/org/tmatesoft/hg/internal/EncodingHelper.java @ 667:fba85bc1dfb8

Refactoring: move all encoding/decoding operations into single place, EncodingHelper
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Thu, 11 Jul 2013 17:54:08 +0200
parents 47b7bedf0569
children f568330dd9c0
line wrap: on
line diff
--- a/src/org/tmatesoft/hg/internal/EncodingHelper.java	Wed Jul 10 20:22:07 2013 +0200
+++ b/src/org/tmatesoft/hg/internal/EncodingHelper.java	Thu Jul 11 17:54:08 2013 +0200
@@ -18,6 +18,7 @@
 
 import static org.tmatesoft.hg.util.LogFacility.Severity.Error;
 
+import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
@@ -26,15 +27,19 @@
 import java.nio.charset.CharsetEncoder;
 
 import org.tmatesoft.hg.core.SessionContext;
+import org.tmatesoft.hg.repo.HgInvalidStateException;
 
 /**
  * Keep all encoding-related issues in the single place
  * NOT thread-safe (encoder and decoder requires synchronized access)
+ * 
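+ * @see <a href="http://mercurial.selenic.com/wiki/EncodingStrategy">Mercurial wiki: EncodingStrategy</a>
+ * @see <a href="http://mercurial.selenic.com/wiki/WindowsUTF8Plan">Mercurial wiki: WindowsUTF8Plan</a>
+ * @see <a href="http://mercurial.selenic.com/wiki/CharacterEncodingOnWindows">Mercurial wiki: CharacterEncodingOnWindows</a>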
  * @author Artem Tikhomirov
  * @author TMate Software Ltd.
  */
 public class EncodingHelper {
-	// XXX perhaps, shall not be full of statics, but rather an instance coming from e.g. HgRepository?
 	/*
 	 * To understand what Mercurial thinks of UTF-8 and Unix byte approach to names, see
 	 * http://mercurial.808500.n3.nabble.com/Unicode-support-request-td3430704.html
@@ -43,11 +48,21 @@
 	private final SessionContext sessionContext;
 	private final CharsetEncoder encoder;
 	private final CharsetDecoder decoder;
+	private final CharsetEncoder utfEncoder;
+	private final CharsetDecoder utfDecoder;
 	
-	EncodingHelper(Charset fsEncoding, SessionContext ctx) {
-		sessionContext = ctx;
+	EncodingHelper(Charset fsEncoding, SessionContext.Source ctx) {
+		sessionContext = ctx.getSessionContext();
 		decoder = fsEncoding.newDecoder();
 		encoder = fsEncoding.newEncoder();
+		Charset utf8 = getUTF8();
+		if (fsEncoding.equals(utf8)) {
+			utfDecoder = decoder;
+			utfEncoder = encoder;
+		} else {
+			utfDecoder = utf8.newDecoder();
+			utfEncoder = utf8.newEncoder();
+		}
 	}
 
 	/**
@@ -65,7 +80,7 @@
 			// perhaps, can return byte[0] in this case?
 			throw new IllegalArgumentException();
 		}
-		return encodeWithSystemDefaultFallback(s);
+		return toArray(encodeWithSystemDefaultFallback(s));
 	}
 
 	/**
@@ -79,9 +94,51 @@
 		if (fname == null) {
 			throw new IllegalArgumentException();
 		}
+		return toArray(encodeWithSystemDefaultFallback(fname));
+	}
+	
+	/**
+	 * prepare filename to be serialized into fncache file
+	 */
+	public ByteBuffer toFNCache(CharSequence fname) {
 		return encodeWithSystemDefaultFallback(fname);
 	}
+	
+	public byte[] toBundle(CharSequence fname) {
+		// Yes, Mercurial transfers file names in the local encoding,
+		// so if your local encoding doesn't match the one on the server
+		// and you use native characters, the exchange is likely to fail.
+		return toArray(encodeWithSystemDefaultFallback(fname));
+	}
+	public String fromBundle(byte[] data, int start, int length) {
+		return decodeWithSystemDefaultFallback(data, start, length);
+	}
+	
+	
+	public String userFromChangeset(byte[] data, int start, int length) {
+		return decodeUnicodeWithFallback(data, start, length);
+	}
+	
+	public String commentFromChangeset(byte[] data, int start, int length) {
+		return decodeUnicodeWithFallback(data, start, length);
+	}
+	
+	public String fileFromChangeset(byte[] data, int start, int length) {
+		return decodeWithSystemDefaultFallback(data, start, length);
+	}
 
+	public byte[] userToChangeset(CharSequence user) {
+		return toArray(encodeUnicode(user));
+	}
+	
+	public byte[] commentToChangeset(CharSequence comment) {
+		return toArray(encodeUnicode(comment));
+	}
+	
+	public byte[] fileToChangeset(CharSequence file) {
+		return toArray(encodeWithSystemDefaultFallback(file));
+	}
+	
 	private String decodeWithSystemDefaultFallback(byte[] data, int start, int length) {
 		try {
 			return decoder.decode(ByteBuffer.wrap(data, start, length)).toString();
@@ -92,18 +149,53 @@
 		}
 	}
 	
-	private byte[] encodeWithSystemDefaultFallback(CharSequence s) {
+	private ByteBuffer encodeWithSystemDefaultFallback(CharSequence s) {
 		try {
 			// synchronized(encoder) {
-			ByteBuffer bb = encoder.encode(CharBuffer.wrap(s));
+			return encoder.encode(CharBuffer.wrap(s));
 			// }
-			byte[] rv = new byte[bb.remaining()];
-			bb.get(rv, 0, rv.length);
-			return rv;
 		} catch (CharacterCodingException ex) {
 			sessionContext.getLog().dump(getClass(), Error, ex, String.format("Use of charset %s failed, resort to system default", charset().name()));
 			// resort to system-default
-			return s.toString().getBytes();
+			return ByteBuffer.wrap(s.toString().getBytes());
+		}
+	}
+
+	private byte[] toArray(ByteBuffer bb) { // remaining bytes of bb; the result may be bb's own backing array
+		byte[] rv;
+		if (bb.hasArray() && bb.arrayOffset() == 0) {
+			rv = bb.array();
+			if (rv.length == bb.remaining()) {
+				return rv; // backing array is exactly the remaining content: hand it out without copying
+			}
+			// fall through: the backing array is larger than the remaining content, copy below
+		}
+		rv = new byte[bb.remaining()];
+		bb.get(rv, 0, rv.length); // relative bulk get: copies the bytes and advances bb's position
+		return rv;
+	}
+
+	private String decodeUnicodeWithFallback(byte[] data, int start, int length) {
+		try {
+			return utfDecoder.decode(ByteBuffer.wrap(data, start, length)).toString();
+		} catch (CharacterCodingException ex) {
+			// utfDecoder rejected the bytes: decode them as ISO-8859-1 instead. TODO post-1.2: respect the actual ui.fallbackencoding setting
+			return new String(data, start, length, Charset.forName("ISO-8859-1")); // NOTE(review): this String constructor is Java 1.6+ - confirm the minimum supported JDK
+		}
+	}
+	
+	private ByteBuffer encodeUnicode(CharSequence s) {
+		// Try utfEncoder first; if it reports a coding error, retry through String#getBytes with the charset named by getUTF8().
+		try {
+			return utfEncoder.encode(CharBuffer.wrap(s));
+		} catch (CharacterCodingException ex) {
+			byte[] rv;
+			try {
+				rv = s.toString().getBytes(getUTF8().name()); // XXX Java 1.5 (presumably: the getBytes(Charset) overload is 1.6+, hence the charset name - confirm)
+			} catch (UnsupportedEncodingException e) {
+				throw new HgInvalidStateException("Unexpected error trying to get UTF-8 encoding"); // NOTE(review): the cause 'e' is not propagated
+			}
+			return ByteBuffer.wrap(rv);
+		}
+	}