view src/org/tmatesoft/hg/internal/EncodingHelper.java @ 709:497e697636fc

Report merged lines as changed block if possible, not as a sequence of added/deleted blocks. To facilitate access to merge parent lines AddBlock got mergeLineAt() method that reports index of the line in the second parent (if any), while insertedAt() has been changed to report index in the first parent always
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Wed, 21 Aug 2013 16:23:27 +0200
parents f568330dd9c0
children
line wrap: on
line source
/*
 * Copyright (c) 2011-2013 TMate Software Ltd
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * For information on how to redistribute this software under
 * the terms of a license other than GNU General Public License
 * contact TMate Software at support@hg4j.com
 */
package org.tmatesoft.hg.internal;

import static org.tmatesoft.hg.util.LogFacility.Severity.Error;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

import org.tmatesoft.hg.core.SessionContext;
import org.tmatesoft.hg.repo.HgInvalidStateException;

/**
 * Keeps all encoding-related issues in a single place.
 * NOT thread-safe (encoder and decoder require synchronized access)
 * 
 * @see <a href="http://mercurial.selenic.com/wiki/EncodingStrategy">EncodingStrategy</a>
 * @see <a href="http://mercurial.selenic.com/wiki/WindowsUTF8Plan">WindowsUTF8Plan</a>
 * @see <a href="http://mercurial.selenic.com/wiki/CharacterEncodingOnWindows">CharacterEncodingOnWindows</a>
 * @author Artem Tikhomirov
 * @author TMate Software Ltd.
 */
public class EncodingHelper {
	/*
	 * To understand what Mercurial thinks of UTF-8 and Unix byte approach to names, see
	 * http://mercurial.808500.n3.nabble.com/Unicode-support-request-td3430704.html
	 */
	
	private final SessionContext sessionContext;
	private final CharsetEncoder encoder;
	private final CharsetDecoder decoder;
	private final CharsetEncoder utfEncoder;
	private final CharsetDecoder utfDecoder;
	
	EncodingHelper(Charset fsEncoding, SessionContext.Source ctx) {
		sessionContext = ctx.getSessionContext();
		decoder = fsEncoding.newDecoder();
		encoder = fsEncoding.newEncoder();
		Charset utf8 = getUTF8();
		if (fsEncoding.equals(utf8)) {
			utfDecoder = decoder;
			utfEncoder = encoder;
		} else {
			utfDecoder = utf8.newDecoder();
			utfEncoder = utf8.newEncoder();
		}
	}

	/**
	 * Translate file names from manifest to amazing Unicode string 
	 */
	public String fromManifest(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}
	
	/**
	 * @return byte representation of the string directly comparable to bytes in manifest
	 */
	public byte[] toManifest(CharSequence s) {
		if (s == null) {
			// perhaps, can return byte[0] in this case?
			throw new IllegalArgumentException();
		}
		return toArray(encodeWithSystemDefaultFallback(s));
	}

	/**
	 * Translate file names from dirstate to amazing Unicode string 
	 */
	public String fromDirstate(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}
	
	public byte[] toDirstate(CharSequence fname) {
		if (fname == null) {
			throw new IllegalArgumentException();
		}
		return toArray(encodeWithSystemDefaultFallback(fname));
	}
	
	/**
	 * prepare filename to be serialized into fncache file
	 */
	public ByteBuffer toFNCache(CharSequence fname) {
		return encodeWithSystemDefaultFallback(fname);
	}
	
	public byte[] toBundle(CharSequence fname) {
		// yes, mercurial transfers filenames in local encoding
		// so that if your local encoding doesn't match that on server, 
		// and you use native characters, you'd likely fail
		return toArray(encodeWithSystemDefaultFallback(fname));
	}
	public String fromBundle(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}
	
	
	public String userFromChangeset(byte[] data, int start, int length) {
		return decodeUnicodeWithFallback(data, start, length);
	}
	
	public String commentFromChangeset(byte[] data, int start, int length) {
		return decodeUnicodeWithFallback(data, start, length);
	}
	
	public String fileFromChangeset(byte[] data, int start, int length) {
		return decodeWithSystemDefaultFallback(data, start, length);
	}

	public byte[] userToChangeset(CharSequence user) {
		return toArray(encodeUnicode(user));
	}
	
	public byte[] commentToChangeset(CharSequence comment) {
		return toArray(encodeUnicode(comment));
	}
	
	public byte[] fileToChangeset(CharSequence file) {
		return toArray(encodeWithSystemDefaultFallback(file));
	}
	
	private String decodeWithSystemDefaultFallback(byte[] data, int start, int length) {
		try {
			return decoder.decode(ByteBuffer.wrap(data, start, length)).toString();
		} catch (CharacterCodingException ex) {
			sessionContext.getLog().dump(getClass(), Error, ex, String.format("Use of charset %s failed, resort to system default", charset().name()));
			// resort to system-default
			return new String(data, start, length);
		}
	}
	
	private ByteBuffer encodeWithSystemDefaultFallback(CharSequence s) {
		try {
			// synchronized(encoder) {
			return encoder.encode(CharBuffer.wrap(s));
			// }
		} catch (CharacterCodingException ex) {
			sessionContext.getLog().dump(getClass(), Error, ex, String.format("Use of charset %s failed, resort to system default", charset().name()));
			// resort to system-default
			return ByteBuffer.wrap(s.toString().getBytes());
		}
	}

	private byte[] toArray(ByteBuffer bb) {
		byte[] rv;
		if (bb.hasArray() && bb.arrayOffset() == 0) {
			rv = bb.array();
			if (rv.length == bb.remaining()) {
				return rv;
			}
			// fall through
		}
		rv = new byte[bb.remaining()];
		bb.get(rv, 0, rv.length);
		return rv;
	}

	private String decodeUnicodeWithFallback(byte[] data, int start, int length) {
		try {
			return utfDecoder.decode(ByteBuffer.wrap(data, start, length)).toString();
		} catch (CharacterCodingException ex) {
			// TODO post-1.2 respect ui.fallbackencoding actual setting
			try {
				return new String(data, start, length, "ISO-8859-1"); // XXX java5
			} catch (UnsupportedEncodingException e) {
				throw new HgInvalidStateException(ex.getMessage());
			}
		}
	}
	
	private ByteBuffer encodeUnicode(CharSequence s) {
		// 
		try {
			return utfEncoder.encode(CharBuffer.wrap(s));
		} catch (CharacterCodingException ex) {
			byte[] rv;
			try {
				rv = s.toString().getBytes(getUTF8().name()); // XXX Java 1.5
			} catch (UnsupportedEncodingException e) {
				throw new HgInvalidStateException("Unexpected error trying to get UTF-8 encoding"); 
			}
			return ByteBuffer.wrap(rv);
		}
	}

	private Charset charset() {
		return encoder.charset();
	}

	public static Charset getUTF8() {
		return Charset.forName("UTF-8");
	}
}