# HG changeset patch # User Artem Tikhomirov # Date 1332338042 -3600 # Node ID 63c5a9d7ca3f91e8345899656e68c15c350586ff # Parent 464b4404e75d9e8dafc12d6f9e5a34157902f6f6 Follow-up for Issue 29: unify path translation for manifest and dirstate diff -r 464b4404e75d -r 63c5a9d7ca3f src/org/tmatesoft/hg/internal/EncodingHelper.java --- a/src/org/tmatesoft/hg/internal/EncodingHelper.java Tue Mar 20 17:56:50 2012 +0100 +++ b/src/org/tmatesoft/hg/internal/EncodingHelper.java Wed Mar 21 14:54:02 2012 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 TMate Software Ltd + * Copyright (c) 2011-2012 TMate Software Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,9 +16,11 @@ */ package org.tmatesoft.hg.internal; -import java.io.UnsupportedEncodingException; - -import org.tmatesoft.hg.core.HgBadStateException; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; /** * Keep all encoding-related issues in the single place @@ -27,13 +29,33 @@ */ public class EncodingHelper { // XXX perhaps, shall not be full of statics, but rather an instance coming from e.g. HgRepository? + /* + * To understand what Mercurial thinks of UTF-8 and Unix byte approach to names, see + * http://mercurial.808500.n3.nabble.com/Unicode-support-request-td3430704.html + */ + + private final CharsetEncoder encoder; + private final CharsetDecoder decoder; + + EncodingHelper(Charset fsEncoding) { + decoder = fsEncoding.newDecoder(); + encoder = fsEncoding.newEncoder(); + } - public static String fromManifest(byte[] data, int start, int length) { + public String fromManifest(byte[] data, int start, int length) { try { - return new String(data, start, length, "ISO-8859-1"); - } catch (UnsupportedEncodingException ex) { - // can't happen - throw new HgBadStateException(ex); + return decoder.decode(ByteBuffer.wrap(data, start, length)).toString(); + } catch (CharacterCodingException ex) { + // resort to system-default + return new String(data, start, length); } } + + public String fromDirstate(byte[] data, int start, int length) throws CharacterCodingException { + return decoder.decode(ByteBuffer.wrap(data, start, length)).toString(); + } + + public Charset charset() { + return encoder.charset(); + } } diff -r 464b4404e75d -r 63c5a9d7ca3f src/org/tmatesoft/hg/internal/Internals.java --- a/src/org/tmatesoft/hg/internal/Internals.java Tue Mar 20 17:56:50 2012 +0100 +++ b/src/org/tmatesoft/hg/internal/Internals.java Wed Mar 21 14:54:02 2012 +0100 @@ -71,7 +71,7 @@ * and if your project happen to use anything but filesystem default (say, UTF8 on cp1251 system), * native storage paths won't match */ - public static final String CFG_PROPERT_FS_FILENAME_ENCODING = "hg.fs.filename.encoding"; + public static final String CFG_PROPERTY_FS_FILENAME_ENCODING = "hg.fs.filename.encoding"; private int requiresFlags = 0; private List filterFactories; @@ -109,20 +109,9 @@ // XXX perhaps, should keep both fields right here, not in the HgRepository public PathRewrite buildDataFilesHelper() { - Object altEncoding = sessionContext.getProperty(CFG_PROPERT_FS_FILENAME_ENCODING, null); - Charset cs; - if (altEncoding == null) { - cs = Charset.defaultCharset(); - } else { - try { - cs = Charset.forName(altEncoding.toString()); - } catch (IllegalArgumentException ex) { - // both IllegalCharsetNameException and UnsupportedCharsetException are subclasses of IAE, too - // not severe enough to throw an exception, imo. Just record the fact it's bad ad we ignore it - sessionContext.getLog().error(getClass(), ex, String.format("Bad configuration value for filename encoding %s", altEncoding)); - cs = Charset.defaultCharset(); - } - } + // Note, tests in TestStorePath depend on the encoding not being cached + Charset cs = getFileEncoding(); + // StoragePathHelper needs fine-grained control over char encoding, hence doesn't use EncodingHelper return new StoragePathHelper((requiresFlags & STORE) != 0, (requiresFlags & FNCACHE) != 0, (requiresFlags & DOTENCODE) != 0, cs); } @@ -178,6 +167,28 @@ public boolean isCaseSensitiveFileSystem() { return isCaseSensitiveFileSystem; } + + public EncodingHelper buildFileNameEncodingHelper() { + return new EncodingHelper(getFileEncoding()); + } + + private Charset getFileEncoding() { + Object altEncoding = sessionContext.getProperty(CFG_PROPERTY_FS_FILENAME_ENCODING, null); + Charset cs; + if (altEncoding == null) { + cs = Charset.defaultCharset(); + } else { + try { + cs = Charset.forName(altEncoding.toString()); + } catch (IllegalArgumentException ex) { + // both IllegalCharsetNameException and UnsupportedCharsetException are subclasses of IAE, too + // not severe enough to throw an exception, imo. Just record the fact it's bad ad we ignore it + sessionContext.getLog().error(Internals.class, ex, String.format("Bad configuration value for filename encoding %s", altEncoding)); + cs = Charset.defaultCharset(); + } + } + return cs; + } public static boolean runningOnWindows() { return System.getProperty("os.name").indexOf("Windows") != -1; diff -r 464b4404e75d -r 63c5a9d7ca3f src/org/tmatesoft/hg/repo/HgDirstate.java --- a/src/org/tmatesoft/hg/repo/HgDirstate.java Tue Mar 20 17:56:50 2012 +0100 +++ b/src/org/tmatesoft/hg/repo/HgDirstate.java Wed Mar 21 14:54:02 2012 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2011 TMate Software Ltd + * Copyright (c) 2010-2012 TMate Software Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,6 +23,7 @@ import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; +import java.nio.charset.CharacterCodingException; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; @@ -32,6 +33,7 @@ import org.tmatesoft.hg.core.HgInvalidControlFileException; import org.tmatesoft.hg.core.Nodeid; import org.tmatesoft.hg.internal.DataAccess; +import org.tmatesoft.hg.internal.EncodingHelper; import org.tmatesoft.hg.util.Pair; import org.tmatesoft.hg.util.Path; import org.tmatesoft.hg.util.PathPool; @@ -74,7 +76,7 @@ canonicalPathRewrite = canonicalPath; } - /*package-local*/ void read() throws HgInvalidControlFileException { + /*package-local*/ void read(EncodingHelper encodingHelper) throws HgInvalidControlFileException { normal = added = removed = merged = Collections.emptyMap(); parents = new Pair(Nodeid.NULL, Nodeid.NULL); if (canonicalPathRewrite != null) { @@ -108,13 +110,13 @@ da.readBytes(name, 0, nameLen); for (int i = 0; i < nameLen; i++) { if (name[i] == 0) { - fn1 = new String(name, 0, i, "UTF-8"); // XXX unclear from documentation what encoding is used there - fn2 = new String(name, i+1, nameLen - i - 1, "UTF-8"); // need to check with different system codepages + fn1 = encodingHelper.fromDirstate(name, 0, i); + fn2 = encodingHelper.fromDirstate(name, i+1, nameLen - i - 1); break; } } if (fn1 == null) { - fn1 = new String(name); + fn1 = encodingHelper.fromDirstate(name, 0, nameLen); } Record r = new Record(fmode, size, time, pathPool.path(fn1), fn2 == null ? null : pathPool.path(fn2)); if (canonicalPathRewrite != null) { @@ -145,6 +147,8 @@ repo.getContext().getLog().warn(getClass(), "Dirstate record for file %s (size: %d, tstamp:%d) has unknown state '%c'", r.name1, r.size(), r.time, state); } } + } catch (CharacterCodingException ex) { + throw new HgInvalidControlFileException(String.format("Failed reading file names from dirstate using encoding %s", encodingHelper.charset().name()), ex, dirstateFile); } catch (IOException ex) { throw new HgInvalidControlFileException("Dirstate read failed", ex, dirstateFile); } finally { diff -r 464b4404e75d -r 63c5a9d7ca3f src/org/tmatesoft/hg/repo/HgInternals.java --- a/src/org/tmatesoft/hg/repo/HgInternals.java Tue Mar 20 17:56:50 2012 +0100 +++ b/src/org/tmatesoft/hg/repo/HgInternals.java Wed Mar 21 14:54:02 2012 +0100 @@ -73,7 +73,7 @@ }; } HgDirstate ds = new HgDirstate(repo, new File(repo.getRepositoryRoot(), "dirstate"), new PathPool(new PathRewrite.Empty()), canonicalPath); - ds.read(); + ds.read(repo.getImplHelper().buildFileNameEncodingHelper()); return ds; } diff -r 464b4404e75d -r 63c5a9d7ca3f src/org/tmatesoft/hg/repo/HgManifest.java --- a/src/org/tmatesoft/hg/repo/HgManifest.java Tue Mar 20 17:56:50 2012 +0100 +++ b/src/org/tmatesoft/hg/repo/HgManifest.java Wed Mar 21 14:54:02 2012 +0100 @@ -51,6 +51,7 @@ */ public class HgManifest extends Revlog { private RevisionMapper revisionMap; + private EncodingHelper encodingHelper; public enum Flags { Exec, Link; @@ -95,8 +96,9 @@ } } - /*package-local*/ HgManifest(HgRepository hgRepo, RevlogStream content) { + /*package-local*/ HgManifest(HgRepository hgRepo, RevlogStream content, EncodingHelper eh) { super(hgRepo, content); + encodingHelper = eh; } /** @@ -171,7 +173,7 @@ manifestLast = manifestFirst; manifestFirst = x; } - content.iterate(manifestFirst, manifestLast, true, new ManifestParser(inspector)); + content.iterate(manifestFirst, manifestLast, true, new ManifestParser(inspector, encodingHelper)); } /** @@ -189,7 +191,7 @@ throw new IllegalArgumentException(); } int[] manifestRevs = toManifestRevisionIndexes(revisionIndexes, inspector); - content.iterate(manifestRevs, true, new ManifestParser(inspector)); + content.iterate(manifestRevs, true, new ManifestParser(inspector, encodingHelper)); } // @@ -331,11 +333,13 @@ private int start; private final int hash, length; private Path result; + private final EncodingHelper encHelper; - public PathProxy(byte[] data, int start, int length) { + public PathProxy(byte[] data, int start, int length, EncodingHelper eh) { this.data = data; this.start = start; this.length = length; + this.encHelper = eh; // copy from String.hashCode(). In fact, not necessarily match result of String(data).hashCode // just need some nice algorithm here @@ -373,7 +377,7 @@ public Path freeze() { if (result == null) { - result = Path.create(EncodingHelper.fromManifest(data, start, length)); + result = Path.create(encHelper.fromManifest(data, start, length)); // release reference to bigger data array, make a copy of relevant part only // use original bytes, not those from String above to avoid cache misses due to different encodings byte[] d = new byte[length]; @@ -393,11 +397,13 @@ private byte[] nodeidLookupBuffer = new byte[20]; // get reassigned each time new Nodeid is added to pool private final ProgressSupport progressHelper; private IterateControlMediator iterateControl; + private final EncodingHelper encHelper; - public ManifestParser(Inspector delegate) { + public ManifestParser(Inspector delegate, EncodingHelper eh) { assert delegate != null; inspector = delegate; inspector2 = delegate instanceof Inspector2 ? (Inspector2) delegate : null; + encHelper = eh; nodeidPool = new Pool2(); fnamePool = new Pool2(); thisRevPool = new Pool2(); @@ -421,7 +427,7 @@ int x = i; for( ; data[i] != '\n' && i < actualLen; i++) { if (fname == null && data[i] == 0) { - PathProxy px = fnamePool.unify(new PathProxy(data, x, i - x)); + PathProxy px = fnamePool.unify(new PathProxy(data, x, i - x, encHelper)); // if (cached = fnamePool.unify(px))== px then cacheMiss, else cacheHit // cpython 0..10k: hits: 15 989 152, misses: 3020 fname = px.freeze(); diff -r 464b4404e75d -r 63c5a9d7ca3f src/org/tmatesoft/hg/repo/HgRepository.java --- a/src/org/tmatesoft/hg/repo/HgRepository.java Tue Mar 20 17:56:50 2012 +0100 +++ b/src/org/tmatesoft/hg/repo/HgRepository.java Wed Mar 21 14:54:02 2012 +0100 @@ -145,7 +145,7 @@ public HgManifest getManifest() { if (manifest == null) { RevlogStream content = resolve(Path.create(repoPathHelper.rewrite("00manifest.i")), true); - manifest = new HgManifest(this, content); + manifest = new HgManifest(this, content, impl.buildFileNameEncodingHelper()); } return manifest; } @@ -308,7 +308,7 @@ }; } HgDirstate ds = new HgDirstate(this, new File(repoDir, "dirstate"), pathPool, canonicalPath); - ds.read(); + ds.read(impl.buildFileNameEncodingHelper()); return ds; } @@ -385,6 +385,10 @@ /*package-local*/ SessionContext getContext() { return sessionContext; } + + /*package-local*/ Internals getImplHelper() { + return impl; + } private List instantiateFilters(Path p, Filter.Options opts) { List factories = impl.getFilters(this); diff -r 464b4404e75d -r 63c5a9d7ca3f test/org/tmatesoft/hg/test/TestStorePath.java --- a/test/org/tmatesoft/hg/test/TestStorePath.java Tue Mar 20 17:56:50 2012 +0100 +++ b/test/org/tmatesoft/hg/test/TestStorePath.java Wed Mar 21 14:54:02 2012 +0100 @@ -119,11 +119,11 @@ public void testNationalChars() { String s = "Привет.txt"; // - propertyOverrides.put(Internals.CFG_PROPERT_FS_FILENAME_ENCODING, "cp1251"); + propertyOverrides.put(Internals.CFG_PROPERTY_FS_FILENAME_ENCODING, "cp1251"); PathRewrite sph = internals.buildDataFilesHelper(); errorCollector.checkThat(sph.rewrite(s), CoreMatchers.equalTo("store/data/~cf~f0~e8~e2~e5~f2.txt.i")); // - propertyOverrides.put(Internals.CFG_PROPERT_FS_FILENAME_ENCODING, "UTF8"); + propertyOverrides.put(Internals.CFG_PROPERTY_FS_FILENAME_ENCODING, "UTF8"); sph = internals.buildDataFilesHelper(); errorCollector.checkThat(sph.rewrite(s), CoreMatchers.equalTo("store/data/~d0~9f~d1~80~d0~b8~d0~b2~d0~b5~d1~82.txt.i")); }