view src/org/tmatesoft/hg/internal/EncodingHelper.java @ 667:fba85bc1dfb8

Refactoring: move all encoding/decoding operations into single place, EncodingHelper
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Thu, 11 Jul 2013 17:54:08 +0200
parents 47b7bedf0569
children f568330dd9c0
line wrap: on
line source
/*
 * Copyright (c) 2011-2013 TMate Software Ltd
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * For information on how to redistribute this software under
 * the terms of a license other than GNU General Public License
 * contact TMate Software at support@hg4j.com
 */
package org.tmatesoft.hg.internal;

import static org.tmatesoft.hg.util.LogFacility.Severity.Error;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

import org.tmatesoft.hg.core.SessionContext;
import org.tmatesoft.hg.repo.HgInvalidStateException;

/**
 * Keeps all encoding-related operations in a single place.
 * <p>
 * File names (manifest, dirstate, fncache, bundles, file entries of a changeset) are processed
 * with the charset supplied to the constructor; changeset user and comment are always UTF-8.
 * <p>
 * NOT thread-safe (encoder and decoder require synchronized access).
 * 
 * @see <a href="http://mercurial.selenic.com/wiki/EncodingStrategy">Mercurial wiki: EncodingStrategy</a>
 * @see <a href="http://mercurial.selenic.com/wiki/WindowsUTF8Plan">Mercurial wiki: WindowsUTF8Plan</a>
 * @see <a href="http://mercurial.selenic.com/wiki/CharacterEncodingOnWindows">Mercurial wiki: CharacterEncodingOnWindows</a>
 * @author Artem Tikhomirov
 * @author TMate Software Ltd.
 */
public class EncodingHelper {
	/*
	 * To understand what Mercurial thinks of UTF-8 and Unix byte approach to names, see
	 * http://mercurial.808500.n3.nabble.com/Unicode-support-request-td3430704.html
	 */

	// Both charsets are looked up once instead of on every call
	private static final Charset UTF_8 = Charset.forName("UTF-8");
	private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
	
	private final SessionContext sessionContext;
	private final CharsetEncoder encoder;
	private final CharsetDecoder decoder;
	private final CharsetEncoder utfEncoder;
	private final CharsetDecoder utfDecoder;
	
	/**
	 * @param fsEncoding charset to encode/decode file names with
	 * @param ctx provides session context, which in turn gives access to the log facility
	 */
	EncodingHelper(Charset fsEncoding, SessionContext.Source ctx) {
		sessionContext = ctx.getSessionContext();
		decoder = fsEncoding.newDecoder();
		encoder = fsEncoding.newEncoder();
		if (fsEncoding.equals(UTF_8)) {
			// file name charset is UTF-8 already, share the same coder pair
			utfDecoder = decoder;
			utfEncoder = encoder;
		} else {
			utfDecoder = UTF_8.newDecoder();
			utfEncoder = UTF_8.newEncoder();
		}
	}

	/**
	 * Translate file name bytes from manifest to a string
	 * 
	 * @param data array with encoded name
	 * @param start index of the first byte of the name
	 * @param length number of bytes the name occupies
	 * @return decoded file name
	 */
	public String fromManifest(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}
	
	/**
	 * @param s file name, never <code>null</code>
	 * @return byte representation of the string directly comparable to bytes in manifest
	 * @throws IllegalArgumentException if argument is <code>null</code>
	 */
	public byte[] toManifest(CharSequence s) {
		// perhaps, can return byte[0] for null argument?
		return toArray(encodeWithSystemDefaultFallback(notNull(s, "File name")));
	}

	/**
	 * Translate file name bytes from dirstate to a string
	 * 
	 * @param data array with encoded name
	 * @param start index of the first byte of the name
	 * @param length number of bytes the name occupies
	 * @return decoded file name
	 */
	public String fromDirstate(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}
	
	/**
	 * @param fname file name, never <code>null</code>
	 * @return file name bytes in the file name charset
	 * @throws IllegalArgumentException if argument is <code>null</code>
	 */
	public byte[] toDirstate(CharSequence fname) {
		return toArray(encodeWithSystemDefaultFallback(notNull(fname, "File name")));
	}
	
	/**
	 * Prepare filename to be serialized into fncache file
	 * 
	 * @param fname file name, never <code>null</code>
	 * @return buffer with file name bytes in the file name charset
	 * @throws IllegalArgumentException if argument is <code>null</code>
	 */
	public ByteBuffer toFNCache(CharSequence fname) {
		return encodeWithSystemDefaultFallback(notNull(fname, "File name"));
	}
	
	/**
	 * @param fname file name, never <code>null</code>
	 * @return file name bytes in the file name charset
	 * @throws IllegalArgumentException if argument is <code>null</code>
	 */
	public byte[] toBundle(CharSequence fname) {
		// yes, mercurial transfers filenames in local encoding
		// so that if your local encoding doesn't match that on server, 
		// and you use native characters, you'd likely fail
		return toArray(encodeWithSystemDefaultFallback(notNull(fname, "File name")));
	}

	/**
	 * Translate file name bytes from a bundle to a string
	 * 
	 * @param data array with encoded name
	 * @param start index of the first byte of the name
	 * @param length number of bytes the name occupies
	 * @return decoded file name
	 */
	public String fromBundle(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}
	
	
	/**
	 * Decode changeset user, UTF-8 first, ISO-8859-1 if bytes are not valid UTF-8
	 */
	public String userFromChangeset(byte[] data, int start, int length) {
		return decodeUnicodeWithFallback(data, start, length);
	}
	
	/**
	 * Decode changeset comment, UTF-8 first, ISO-8859-1 if bytes are not valid UTF-8
	 */
	public String commentFromChangeset(byte[] data, int start, int length) {
		return decodeUnicodeWithFallback(data, start, length);
	}
	
	/**
	 * Decode name of a file listed in a changeset, uses the file name charset
	 */
	public String fileFromChangeset(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}

	/**
	 * @param user changeset user, never <code>null</code>
	 * @return UTF-8 bytes of the user
	 * @throws IllegalArgumentException if argument is <code>null</code>
	 */
	public byte[] userToChangeset(CharSequence user) {
		return toArray(encodeUnicode(notNull(user, "User")));
	}
	
	/**
	 * @param comment changeset comment, never <code>null</code>
	 * @return UTF-8 bytes of the comment
	 * @throws IllegalArgumentException if argument is <code>null</code>
	 */
	public byte[] commentToChangeset(CharSequence comment) {
		return toArray(encodeUnicode(notNull(comment, "Comment")));
	}
	
	/**
	 * @param file file name, never <code>null</code>
	 * @return file name bytes in the file name charset
	 * @throws IllegalArgumentException if argument is <code>null</code>
	 */
	public byte[] fileToChangeset(CharSequence file) {
		return toArray(encodeWithSystemDefaultFallback(notNull(file, "File name")));
	}

	/**
	 * Single place to reject <code>null</code> arguments, so that all encode methods fail the same way
	 */
	private static CharSequence notNull(CharSequence value, String what) {
		if (value == null) {
			throw new IllegalArgumentException(what + " must not be null");
		}
		return value;
	}
	
	/**
	 * Decode bytes with the file name charset; when bytes can't be decoded, log the failure 
	 * and decode with platform default charset instead.
	 */
	private String decodeWithSystemDefaultFallback(byte[] data, int start, int length) {
		try {
			return decoder.decode(ByteBuffer.wrap(data, start, length)).toString();
		} catch (CharacterCodingException ex) {
			sessionContext.getLog().dump(getClass(), Error, ex, String.format("Use of charset %s failed, resort to system default", charset().name()));
			// resort to system-default
			return new String(data, start, length);
		}
	}
	
	/**
	 * Encode characters with the file name charset; when characters can't be encoded, log the failure 
	 * and encode with platform default charset instead.
	 */
	private ByteBuffer encodeWithSystemDefaultFallback(CharSequence s) {
		try {
			// encoder is shared state, access is not synchronized here, see class comment
			return encoder.encode(CharBuffer.wrap(s));
		} catch (CharacterCodingException ex) {
			sessionContext.getLog().dump(getClass(), Error, ex, String.format("Use of charset %s failed, resort to system default", charset().name()));
			// resort to system-default
			return ByteBuffer.wrap(s.toString().getBytes());
		}
	}

	/**
	 * @return remaining bytes of the buffer; backing array itself (no copy) when it holds 
	 * exactly these bytes and nothing else
	 */
	private byte[] toArray(ByteBuffer bb) {
		if (bb.hasArray() && bb.arrayOffset() == 0) {
			byte[] backing = bb.array();
			if (backing.length == bb.remaining()) {
				return backing;
			}
			// backing array is larger than the content, need a copy
		}
		byte[] rv = new byte[bb.remaining()];
		bb.get(rv, 0, rv.length);
		return rv;
	}

	/**
	 * Decode bytes as UTF-8; bytes that are not valid UTF-8 are decoded as ISO-8859-1
	 */
	private String decodeUnicodeWithFallback(byte[] data, int start, int length) {
		try {
			return utfDecoder.decode(ByteBuffer.wrap(data, start, length)).toString();
		} catch (CharacterCodingException ex) {
			// TODO post-1.2 respect ui.fallbackencoding actual setting
			return new String(data, start, length, ISO_8859_1);
		}
	}
	
	/**
	 * Encode characters as UTF-8. Input the strict encoder rejects is encoded leniently.
	 */
	private ByteBuffer encodeUnicode(CharSequence s) {
		try {
			return utfEncoder.encode(CharBuffer.wrap(s));
		} catch (CharacterCodingException ex) {
			// String.getBytes(Charset) replaces characters it can't encode instead of failing,
			// and, unlike getBytes(String), doesn't force us to handle UnsupportedEncodingException
			return ByteBuffer.wrap(s.toString().getBytes(UTF_8));
		}
	}

	private Charset charset() {
		return encoder.charset();
	}

	/**
	 * @return UTF-8 charset, never <code>null</code>
	 */
	public static Charset getUTF8() {
		return UTF_8;
	}
}