comparison src/org/tmatesoft/hg/internal/StoragePathHelper.java @ 411:464b4404e75d smartgit3

Issue 29: Bad storage path translation - translate Unicode chars to filesystem encoding
author Artem Tikhomirov <tikhomirov.artem@gmail.com>
date Tue, 20 Mar 2012 17:56:50 +0100
parents 6d2c6b2469fc
children 528b6780a8bd
comparison
equal deleted inserted replaced
410:df5009d67be2 411:464b4404e75d
1 /* 1 /*
2 * Copyright (c) 2011 TMate Software Ltd 2 * Copyright (c) 2011-2012 TMate Software Ltd
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; version 2 of the License. 6 * the Free Software Foundation; version 2 of the License.
7 * 7 *
14 * the terms of a license other than GNU General Public License 14 * the terms of a license other than GNU General Public License
15 * contact TMate Software at support@hg4j.com 15 * contact TMate Software at support@hg4j.com
16 */ 16 */
17 package org.tmatesoft.hg.internal; 17 package org.tmatesoft.hg.internal;
18 18
19 import java.nio.ByteBuffer;
20 import java.nio.CharBuffer;
21 import java.nio.charset.Charset;
22 import java.nio.charset.CharsetEncoder;
19 import java.util.Arrays; 23 import java.util.Arrays;
20 import java.util.TreeSet; 24 import java.util.TreeSet;
25 import java.util.regex.Matcher;
26 import java.util.regex.Pattern;
21 27
22 import org.tmatesoft.hg.util.PathRewrite; 28 import org.tmatesoft.hg.util.PathRewrite;
23 29
24 /** 30 /**
25 * @see http://mercurial.selenic.com/wiki/CaseFoldingPlan 31 * @see http://mercurial.selenic.com/wiki/CaseFoldingPlan
26 * @see http://mercurial.selenic.com/wiki/fncacheRepoFormat 32 * @see http://mercurial.selenic.com/wiki/fncacheRepoFormat
33 * @see http://mercurial.selenic.com/wiki/EncodingStrategy
27 * 34 *
28 * @author Artem Tikhomirov 35 * @author Artem Tikhomirov
29 * @author TMate Software Ltd. 36 * @author TMate Software Ltd.
30 */ 37 */
31 class StoragePathHelper implements PathRewrite { 38 class StoragePathHelper implements PathRewrite {
32 39
33 private final boolean store; 40 private final boolean store;
34 private final boolean fncache; 41 private final boolean fncache;
35 private final boolean dotencode; 42 private final boolean dotencode;
36 43 private final Pattern suffix2replace;
44 private final CharsetEncoder csEncoder;
45 private final char[] hexEncodedByte = new char[] {'~', '0', '0'};
46 private final ByteBuffer byteEncodingBuf;
47 private final CharBuffer charEncodingBuf;
48
37 public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode) { 49 public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode) {
50 this(isStore, isFncache, isDotencode, Charset.defaultCharset());
51 }
52
53 public StoragePathHelper(boolean isStore, boolean isFncache, boolean isDotencode, Charset fsEncoding) {
54 assert fsEncoding != null;
38 store = isStore; 55 store = isStore;
39 fncache = isFncache; 56 fncache = isFncache;
40 dotencode = isDotencode; 57 dotencode = isDotencode;
58 suffix2replace = Pattern.compile("\\.([id]|hg)/");
59 csEncoder = fsEncoding.newEncoder(); // FIXME catch exception and rethrow as our's RT
60 byteEncodingBuf = ByteBuffer.allocate(Math.round(csEncoder.maxBytesPerChar()) + 1/*in fact, need ceil, hence +1*/);
61 charEncodingBuf = CharBuffer.allocate(1);
41 } 62 }
42 63
43 // FIXME document what path argument is, whether it includes .i or .d, and whether it's 'normalized' (slashes) or not. 64 // FIXME document what path argument is, whether it includes .i or .d, and whether it's 'normalized' (slashes) or not.
44 // since .hg/store keeps both .i files and files without extension (e.g. fncache), guess, for data == false 65 // since .hg/store keeps both .i files and files without extension (e.g. fncache), guess, for data == false
45 // we shall assume path has extension 66 // we shall assume path has extension
46 public CharSequence rewrite(CharSequence p) { 67 public CharSequence rewrite(CharSequence p) {
47 final String STR_STORE = "store/"; 68 final String STR_STORE = "store/";
48 final String STR_DATA = "data/"; 69 final String STR_DATA = "data/";
49 final String STR_DH = "dh/"; 70 final String STR_DH = "dh/";
50 final String reservedChars = "\\:*?\"<>|"; 71 final String reservedChars = "\\:*?\"<>|";
51 char[] hexByte = new char[2];
52 72
53 String path = p.toString(); 73 Matcher suffixMatcher = suffix2replace.matcher(p);
54 path = path.replace(".hg/", ".hg.hg/").replace(".i/", ".i.hg/").replace(".d/", ".d.hg/"); 74 CharSequence path;
75 // Matcher.replaceAll, but without extra toString
76 boolean found = suffixMatcher.find();
77 if (found) {
78 StringBuffer sb = new StringBuffer(p.length() + 20);
79 do {
80 suffixMatcher.appendReplacement(sb, ".$1.hg/");
81 } while (found = suffixMatcher.find());
82 suffixMatcher.appendTail(sb);
83 path = sb;
84 } else {
85 path = p;
86 }
87
55 StringBuilder sb = new StringBuilder(path.length() << 1); 88 StringBuilder sb = new StringBuilder(path.length() << 1);
56 if (store || fncache) { 89 if (store || fncache) {
57 // encodefilename
58 for (int i = 0; i < path.length(); i++) { 90 for (int i = 0; i < path.length(); i++) {
59 final char ch = path.charAt(i); 91 final char ch = path.charAt(i);
60 if (ch >= 'a' && ch <= 'z') { 92 if (ch >= 'a' && ch <= 'z') {
61 sb.append(ch); // POIRAE 93 sb.append(ch); // POIRAE
62 } else if (ch >= 'A' && ch <= 'Z') { 94 } else if (ch >= 'A' && ch <= 'Z') {
63 sb.append('_'); 95 sb.append('_');
64 sb.append(Character.toLowerCase(ch)); // Perhaps, (char) (((int) ch) + 32)? Even better, |= 0x20? 96 sb.append(Character.toLowerCase(ch)); // Perhaps, (char) (((int) ch) + 32)? Even better, |= 0x20?
65 } else if (reservedChars.indexOf(ch) != -1) { 97 } else if (reservedChars.indexOf(ch) != -1) {
66 sb.append('~'); 98 sb.append(toHexByte(ch));
67 sb.append(toHexByte(ch, hexByte));
68 } else if ((ch >= '~' /*126*/ && ch <= 255) || ch < ' ' /*32*/) { 99 } else if ((ch >= '~' /*126*/ && ch <= 255) || ch < ' ' /*32*/) {
69 sb.append('~'); 100 sb.append(toHexByte(ch));
70 sb.append(toHexByte(ch, hexByte));
71 } else if (ch == '_') { 101 } else if (ch == '_') {
72 sb.append('_'); 102 sb.append('_');
73 sb.append('_'); 103 sb.append('_');
74 } else { 104 } else {
75 sb.append(ch); 105 // either an ASCII char that doesn't require special handling, or a Unicode character to be encoded
106 // according to the filesystem/native encoding, see http://mercurial.selenic.com/wiki/EncodingStrategy
107 // Despite what the page says, use of the native encoding seems the worst solution to me (repositories
108 // can't be easily shared between OSes with different encodings then, e.g. Win1251 and Linux UTF8).
109 // If the ease of sharing was not the point, what's the reason to mangle names at all then
110 // (lowercase and exclude reserved device names)?
111 if (ch < '~' /*126*/ || !csEncoder.canEncode(ch)) {
112 sb.append(ch);
113 } else {
114 appendEncoded(sb, ch);
115 }
76 } 116 }
77 } 117 }
78 // auxencode 118 // auxencode
79 if (fncache) { 119 if (fncache) {
80 encodeWindowsDeviceNames(sb); 120 encodeWindowsDeviceNames(sb);
81 } 121 }
82 } 122 }
83 final int MAX_PATH_LEN = 120; 123 final int MAX_PATH_LEN = 120;
84 if (fncache && (sb.length() + STR_DATA.length() + ".i".length() > MAX_PATH_LEN)) { 124 if (fncache && (sb.length() + STR_DATA.length() + ".i".length() > MAX_PATH_LEN)) {
125 // TODO [post-1.0] Mercurial uses system encoding for paths, hence we need to pass bytes to DigestHelper
126 // to ensure our sha1 value (default encoding of unicode string if one looks into DH impl) match that
127 // produced by Mercurial (based on native string).
85 String digest = new DigestHelper().sha1(STR_DATA, path, ".i").asHexString(); 128 String digest = new DigestHelper().sha1(STR_DATA, path, ".i").asHexString();
86 final int DIR_PREFIX_LEN = 8; 129 final int DIR_PREFIX_LEN = 8;
87 // not sure why (-4) is here. 120 - 40 = up to 80 for path with ext. dh/ + ext(.i) = 3+2 130 // not sure why (-4) is here. 120 - 40 = up to 80 for path with ext. dh/ + ext(.i) = 3+2
88 final int MAX_DIR_PREFIX = 8 * (DIR_PREFIX_LEN + 1) - 4; 131 final int MAX_DIR_PREFIX = 8 * (DIR_PREFIX_LEN + 1) - 4;
89 sb = new StringBuilder(MAX_PATH_LEN); 132 sb = new StringBuilder(MAX_PATH_LEN);
92 if (ch >= 'a' && ch <= 'z') { 135 if (ch >= 'a' && ch <= 'z') {
93 sb.append(ch); 136 sb.append(ch);
94 } else if (ch >= 'A' && ch <= 'Z') { 137 } else if (ch >= 'A' && ch <= 'Z') {
95 sb.append((char) (ch | 0x20)); // lowercase 138 sb.append((char) (ch | 0x20)); // lowercase
96 } else if (reservedChars.indexOf(ch) != -1) { 139 } else if (reservedChars.indexOf(ch) != -1) {
97 sb.append('~'); 140 sb.append(toHexByte(ch));
98 sb.append(toHexByte(ch, hexByte));
99 } else if ((ch >= '~' /*126*/ && ch <= 255) || ch < ' ' /*32*/) { 141 } else if ((ch >= '~' /*126*/ && ch <= 255) || ch < ' ' /*32*/) {
100 sb.append('~'); 142 sb.append(toHexByte(ch));
101 sb.append(toHexByte(ch, hexByte));
102 } else { 143 } else {
103 sb.append(ch); 144 if (ch < '~' /*126*/ || !csEncoder.canEncode(ch)) {
145 sb.append(ch);
146 } else {
147 appendEncoded(sb, ch);
148 }
104 } 149 }
105 } 150 }
106 encodeWindowsDeviceNames(sb); 151 encodeWindowsDeviceNames(sb);
107 int fnameStart = sb.lastIndexOf("/"); // since we rewrite file names, it never ends with slash (for dirs, I'd pass length-2); 152 int fnameStart = sb.lastIndexOf("/"); // since we rewrite file names, it never ends with slash (for dirs, I'd pass length-2);
108 StringBuilder completeHashName = new StringBuilder(MAX_PATH_LEN); 153 StringBuilder completeHashName = new StringBuilder(MAX_PATH_LEN);
161 sb.append(".i"); 206 sb.append(".i");
162 return sb.toString(); 207 return sb.toString();
163 } 208 }
164 209
165 private void encodeWindowsDeviceNames(StringBuilder sb) { 210 private void encodeWindowsDeviceNames(StringBuilder sb) {
166 char[] hexByte = new char[2];
167 int x = 0; // last segment start 211 int x = 0; // last segment start
168 final TreeSet<String> windowsReservedFilenames = new TreeSet<String>(); 212 final TreeSet<String> windowsReservedFilenames = new TreeSet<String>();
169 windowsReservedFilenames.addAll(Arrays.asList("con prn aux nul com1 com2 com3 com4 com5 com6 com7 com8 com9 lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9".split(" "))); 213 windowsReservedFilenames.addAll(Arrays.asList("con prn aux nul com1 com2 com3 com4 com5 com6 com7 com8 com9 lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9".split(" ")));
170 do { 214 do {
171 int i = sb.indexOf("/", x); 215 int i = sb.indexOf("/", x);
181 found = windowsReservedFilenames.contains(sb.subSequence(x, x+3)); 225 found = windowsReservedFilenames.contains(sb.subSequence(x, x+3));
182 } else if (i-x > 4 && sb.charAt(x+4) == '.') { 226 } else if (i-x > 4 && sb.charAt(x+4) == '.') {
183 found = windowsReservedFilenames.contains(sb.subSequence(x, x+4)); 227 found = windowsReservedFilenames.contains(sb.subSequence(x, x+4));
184 } 228 }
185 if (found) { 229 if (found) {
186 sb.insert(x+3, toHexByte(sb.charAt(x+2), hexByte)); 230 // x+2 as we change the third letter in device name
187 sb.setCharAt(x+2, '~'); 231 replace(sb, x+2, toHexByte(sb.charAt(x+2)));
188 i += 2; 232 i += 2;
189 } 233 }
190 } 234 }
191 if (dotencode && (sb.charAt(x) == '.' || sb.charAt(x) == ' ')) { 235 if (dotencode && (sb.charAt(x) == '.' || sb.charAt(x) == ' ')) {
192 sb.insert(x+1, toHexByte(sb.charAt(x), hexByte)); 236 char dotOrSpace = sb.charAt(x); // beware, replace() below changes charAt(x), rather get a copy
193 sb.setCharAt(x, '~'); // setChar *after* charAt/insert to get ~2e, not ~7e for '.' 237 // not to get ~7e for '.' instead of ~2e, if later refactoring changes the logic
238 replace(sb, x, toHexByte(dotOrSpace));
194 i += 2; 239 i += 2;
195 } 240 }
196 x = i+1; 241 x = i+1;
197 } while (x < sb.length()); 242 } while (x < sb.length());
198 } 243 }
199 244
200 private static char[] toHexByte(int ch, char[] buf) { 245 // shall be synchronized in case of multithreaded use
201 assert buf.length > 1; 246 private void appendEncoded(StringBuilder sb, char ch) {
247 charEncodingBuf.clear();
248 byteEncodingBuf.clear();
249 charEncodingBuf.put(ch).flip();
250 csEncoder.encode(charEncodingBuf, byteEncodingBuf, false);
251 byteEncodingBuf.flip();
252 while (byteEncodingBuf.hasRemaining()) {
253 sb.append(toHexByte(byteEncodingBuf.get()));
254 }
255 }
256
257 /**
258 * replace char at sb[index] with a sequence
259 */
260 private static void replace(StringBuilder sb, int index, char[] with) {
261 // there's StringBuilder.replace(int, int+1, String), but with char[] - I don't want to make a string out of hexEncodedByte
262 sb.setCharAt(index, with[0]);
263 sb.insert(index+1, with, 1, with.length - 1);
264 }
265
266 /**
267 * write two lowercase hex digits of the low byte of ch after the leading '~' in the shared hexEncodedByte array and return that array (overwritten on every call)
268 */
269 private char[] toHexByte(int ch) {
202 final String hexDigits = "0123456789abcdef"; 270 final String hexDigits = "0123456789abcdef";
203 buf[0] = hexDigits.charAt((ch & 0x00F0) >>> 4); 271 hexEncodedByte[1] = hexDigits.charAt((ch & 0x00F0) >>> 4);
204 buf[1] = hexDigits.charAt(ch & 0x0F); 272 hexEncodedByte[2] = hexDigits.charAt(ch & 0x0F);
205 return buf; 273 return hexEncodedByte;
206 } 274 }
207 } 275 }