tikhomirov@320: /* tikhomirov@526: * Copyright (c) 2011-2013 TMate Software Ltd tikhomirov@320: * tikhomirov@320: * This program is free software; you can redistribute it and/or modify tikhomirov@320: * it under the terms of the GNU General Public License as published by tikhomirov@320: * the Free Software Foundation; version 2 of the License. tikhomirov@320: * tikhomirov@320: * This program is distributed in the hope that it will be useful, tikhomirov@320: * but WITHOUT ANY WARRANTY; without even the implied warranty of tikhomirov@320: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the tikhomirov@320: * GNU General Public License for more details. tikhomirov@320: * tikhomirov@320: * For information on how to redistribute this software under tikhomirov@320: * the terms of a license other than GNU General Public License tikhomirov@320: * contact TMate Software at support@hg4j.com tikhomirov@320: */ tikhomirov@320: package org.tmatesoft.hg.internal; tikhomirov@320: tikhomirov@456: import static org.tmatesoft.hg.util.LogFacility.Severity.Error; tikhomirov@456: tikhomirov@667: import java.io.UnsupportedEncodingException; tikhomirov@412: import java.nio.ByteBuffer; tikhomirov@415: import java.nio.CharBuffer; tikhomirov@412: import java.nio.charset.CharacterCodingException; tikhomirov@412: import java.nio.charset.Charset; tikhomirov@412: import java.nio.charset.CharsetDecoder; tikhomirov@412: import java.nio.charset.CharsetEncoder; tikhomirov@320: tikhomirov@415: import org.tmatesoft.hg.core.SessionContext; tikhomirov@667: import org.tmatesoft.hg.repo.HgInvalidStateException; tikhomirov@415: tikhomirov@320: /** tikhomirov@320: * Keep all encoding-related issues in the single place tikhomirov@415: * NOT thread-safe (encoder and decoder requires synchronized access) tikhomirov@667: * tikhomirov@667: * @see http://mercurial.selenic.com/wiki/EncodingStrategy tikhomirov@667: * @see http://mercurial.selenic.com/wiki/WindowsUTF8Plan tikhomirov@667: * @see http://mercurial.selenic.com/wiki/CharacterEncodingOnWindows tikhomirov@320: * @author Artem Tikhomirov tikhomirov@320: * @author TMate Software Ltd. tikhomirov@320: */ tikhomirov@320: public class EncodingHelper { tikhomirov@412: /* tikhomirov@412: * To understand what Mercurial thinks of UTF-8 and Unix byte approach to names, see tikhomirov@412: * http://mercurial.808500.n3.nabble.com/Unicode-support-request-td3430704.html tikhomirov@412: */ tikhomirov@412: tikhomirov@415: private final SessionContext sessionContext; tikhomirov@412: private final CharsetEncoder encoder; tikhomirov@412: private final CharsetDecoder decoder; tikhomirov@667: private final CharsetEncoder utfEncoder; tikhomirov@667: private final CharsetDecoder utfDecoder; tikhomirov@412: tikhomirov@667: EncodingHelper(Charset fsEncoding, SessionContext.Source ctx) { tikhomirov@667: sessionContext = ctx.getSessionContext(); tikhomirov@412: decoder = fsEncoding.newDecoder(); tikhomirov@412: encoder = fsEncoding.newEncoder(); tikhomirov@667: Charset utf8 = getUTF8(); tikhomirov@667: if (fsEncoding.equals(utf8)) { tikhomirov@667: utfDecoder = decoder; tikhomirov@667: utfEncoder = encoder; tikhomirov@667: } else { tikhomirov@667: utfDecoder = utf8.newDecoder(); tikhomirov@667: utfEncoder = utf8.newEncoder(); tikhomirov@667: } tikhomirov@412: } tikhomirov@320: tikhomirov@418: /** tikhomirov@418: * Translate file names from manifest to amazing Unicode string tikhomirov@418: */ tikhomirov@412: public String fromManifest(byte[] data, int start, int length) { tikhomirov@418: return decodeWithSystemDefaultFallback(data, start, length); tikhomirov@320: } tikhomirov@418: tikhomirov@415: /** tikhomirov@415: * @return byte representation of the string directly comparable to bytes in manifest tikhomirov@415: */ tikhomirov@526: public byte[] toManifest(CharSequence s) { tikhomirov@415: if (s == null) { tikhomirov@415: // perhaps, can return byte[0] in this case? tikhomirov@415: throw new IllegalArgumentException(); tikhomirov@415: } tikhomirov@667: return toArray(encodeWithSystemDefaultFallback(s)); tikhomirov@525: } tikhomirov@525: tikhomirov@525: /** tikhomirov@525: * Translate file names from dirstate to amazing Unicode string tikhomirov@525: */ tikhomirov@525: public String fromDirstate(byte[] data, int start, int length) { tikhomirov@525: return decodeWithSystemDefaultFallback(data, start, length); tikhomirov@525: } tikhomirov@525: tikhomirov@526: public byte[] toDirstate(CharSequence fname) { tikhomirov@525: if (fname == null) { tikhomirov@525: throw new IllegalArgumentException(); tikhomirov@525: } tikhomirov@667: return toArray(encodeWithSystemDefaultFallback(fname)); tikhomirov@667: } tikhomirov@667: tikhomirov@667: /** tikhomirov@667: * prepare filename to be serialized into fncache file tikhomirov@667: */ tikhomirov@667: public ByteBuffer toFNCache(CharSequence fname) { tikhomirov@525: return encodeWithSystemDefaultFallback(fname); tikhomirov@525: } tikhomirov@667: tikhomirov@667: public byte[] toBundle(CharSequence fname) { tikhomirov@667: // yes, mercurial transfers filenames in local encoding tikhomirov@667: // so that if your local encoding doesn't match that on server, tikhomirov@667: // and you use native characters, you'd likely fail tikhomirov@667: return toArray(encodeWithSystemDefaultFallback(fname)); tikhomirov@667: } tikhomirov@667: public String fromBundle(byte[] data, int start, int length) { tikhomirov@667: return decodeWithSystemDefaultFallback(data, start, length); tikhomirov@667: } tikhomirov@667: tikhomirov@667: tikhomirov@667: public String userFromChangeset(byte[] data, int start, int length) { tikhomirov@667: return decodeUnicodeWithFallback(data, start, length); tikhomirov@667: } tikhomirov@667: tikhomirov@667: public String commentFromChangeset(byte[] data, int start, int length) { tikhomirov@667: return decodeUnicodeWithFallback(data, start, length); tikhomirov@667: } tikhomirov@667: tikhomirov@667: public String fileFromChangeset(byte[] data, int start, int length) { tikhomirov@667: return decodeWithSystemDefaultFallback(data, start, length); tikhomirov@667: } tikhomirov@525: tikhomirov@667: public byte[] userToChangeset(CharSequence user) { tikhomirov@667: return toArray(encodeUnicode(user)); tikhomirov@667: } tikhomirov@667: tikhomirov@667: public byte[] commentToChangeset(CharSequence comment) { tikhomirov@667: return toArray(encodeUnicode(comment)); tikhomirov@667: } tikhomirov@667: tikhomirov@667: public byte[] fileToChangeset(CharSequence file) { tikhomirov@667: return toArray(encodeWithSystemDefaultFallback(file)); tikhomirov@667: } tikhomirov@667: tikhomirov@525: private String decodeWithSystemDefaultFallback(byte[] data, int start, int length) { tikhomirov@415: try { tikhomirov@525: return decoder.decode(ByteBuffer.wrap(data, start, length)).toString(); tikhomirov@525: } catch (CharacterCodingException ex) { tikhomirov@525: sessionContext.getLog().dump(getClass(), Error, ex, String.format("Use of charset %s failed, resort to system default", charset().name())); tikhomirov@525: // resort to system-default tikhomirov@525: return new String(data, start, length); tikhomirov@525: } tikhomirov@525: } tikhomirov@525: tikhomirov@667: private ByteBuffer encodeWithSystemDefaultFallback(CharSequence s) { tikhomirov@525: try { tikhomirov@525: // synchronized(encoder) { tikhomirov@667: return encoder.encode(CharBuffer.wrap(s)); tikhomirov@415: // } tikhomirov@415: } catch (CharacterCodingException ex) { tikhomirov@456: sessionContext.getLog().dump(getClass(), Error, ex, String.format("Use of charset %s failed, resort to system default", charset().name())); tikhomirov@415: // resort to system-default tikhomirov@667: return ByteBuffer.wrap(s.toString().getBytes()); tikhomirov@667: } tikhomirov@667: } tikhomirov@667: tikhomirov@667: private byte[] toArray(ByteBuffer bb) { tikhomirov@667: byte[] rv; tikhomirov@667: if (bb.hasArray() && bb.arrayOffset() == 0) { tikhomirov@667: rv = bb.array(); tikhomirov@667: if (rv.length == bb.remaining()) { tikhomirov@667: return rv; tikhomirov@667: } tikhomirov@667: // fall through tikhomirov@667: } tikhomirov@667: rv = new byte[bb.remaining()]; tikhomirov@667: bb.get(rv, 0, rv.length); tikhomirov@667: return rv; tikhomirov@667: } tikhomirov@667: tikhomirov@667: private String decodeUnicodeWithFallback(byte[] data, int start, int length) { tikhomirov@667: try { tikhomirov@667: return utfDecoder.decode(ByteBuffer.wrap(data, start, length)).toString(); tikhomirov@667: } catch (CharacterCodingException ex) { tikhomirov@667: // TODO post-1.2 respect ui.fallbackencoding actual setting tikhomirov@682: try { tikhomirov@682: return new String(data, start, length, "ISO-8859-1"); // XXX java5 tikhomirov@682: } catch (UnsupportedEncodingException e) { tikhomirov@682: throw new HgInvalidStateException(ex.getMessage()); tikhomirov@682: } tikhomirov@667: } tikhomirov@667: } tikhomirov@667: tikhomirov@667: private ByteBuffer encodeUnicode(CharSequence s) { tikhomirov@667: // tikhomirov@667: try { tikhomirov@667: return utfEncoder.encode(CharBuffer.wrap(s)); tikhomirov@667: } catch (CharacterCodingException ex) { tikhomirov@667: byte[] rv; tikhomirov@667: try { tikhomirov@667: rv = s.toString().getBytes(getUTF8().name()); // XXX Java 1.5 tikhomirov@667: } catch (UnsupportedEncodingException e) { tikhomirov@667: throw new HgInvalidStateException("Unexpected error trying to get UTF-8 encoding"); tikhomirov@667: } tikhomirov@667: return ByteBuffer.wrap(rv); tikhomirov@415: } tikhomirov@415: } tikhomirov@415: tikhomirov@418: private Charset charset() { tikhomirov@412: return encoder.charset(); tikhomirov@412: } tikhomirov@415: tikhomirov@527: public static Charset getUTF8() { tikhomirov@527: return Charset.forName("UTF-8"); tikhomirov@527: } tikhomirov@320: }