diff hg4j/src/main/java/org/tmatesoft/hg/internal/RevlogStream.java @ 213:6ec4af642ba8 gradle

Project uses Gradle for build - actual changes
author Alexander Kitaev <kitaev@gmail.com>
date Tue, 10 May 2011 10:52:53 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hg4j/src/main/java/org/tmatesoft/hg/internal/RevlogStream.java	Tue May 10 10:52:53 2011 +0200
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2010-2011 TMate Software Ltd
+ *  
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * For information on how to redistribute this software under
+ * the terms of a license other than GNU General Public License
+ * contact TMate Software at support@hg4j.com
+ */
+package org.tmatesoft.hg.internal;
+
+import static org.tmatesoft.hg.repo.HgRepository.BAD_REVISION;
+import static org.tmatesoft.hg.repo.HgRepository.TIP;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.tmatesoft.hg.core.HgBadStateException;
+import org.tmatesoft.hg.core.Nodeid;
+import org.tmatesoft.hg.repo.HgRepository;
+
+
+/**
+ * ? Single RevlogStream per file per repository with accessor to record access session (e.g. with back/forward operations), 
+ * or numerous RevlogStream with separate representation of the underlying data (cached, lazy ChunkStream)?
+ * 
+ * @see http://mercurial.selenic.com/wiki/Revlog
+ * @see http://mercurial.selenic.com/wiki/RevlogNG
+ *
+ * @author Artem Tikhomirov
+ * @author TMate Software Ltd.
+ */
+public class RevlogStream {
+
+	/*
+	 * makes sense for index with inline data only - actual offset of the record in the .i file (record entry + revision * record size))
+	 * 
+	 * long[] in fact (there are 8-bytes field in the revlog)
+	 * However, (a) DataAccess currently doesn't operate with long seek/length
+	 * and, of greater significance, (b) files with inlined data are designated for smaller files,  
+	 * guess, about 130 Kb, and offset there won't ever break int capacity
+	 */
+	private int[] indexRecordOffset;  
+	private int[] baseRevisions;
+	private boolean inline = false;
+	private final File indexFile;
+	private final DataAccessProvider dataAccess;
+
+	// if we need anything else from HgRepo, might replace DAP parameter with HgRepo and query it for DAP.
+	public RevlogStream(DataAccessProvider dap, File indexFile) {
+		this.dataAccess = dap;
+		this.indexFile = indexFile;
+	}
+
+	/*package*/ DataAccess getIndexStream() {
+		return dataAccess.create(indexFile);
+	}
+
+	/*package*/ DataAccess getDataStream() {
+		final String indexName = indexFile.getName();
+		File dataFile = new File(indexFile.getParentFile(), indexName.substring(0, indexName.length() - 1) + "d");
+		return dataAccess.create(dataFile);
+	}
+	
+	public int revisionCount() {
+		initOutline();
+		return baseRevisions.length;
+	}
+	
+	public int dataLength(int revision) {
+		// XXX in fact, use of iterate() instead of this implementation may be quite reasonable.
+		//
+		final int indexSize = revisionCount();
+		DataAccess daIndex = getIndexStream(); // XXX may supply a hint that I'll need really few bytes of data (although at some offset)
+		if (revision == TIP) {
+			revision = indexSize - 1;
+		}
+		try {
+			int recordOffset = getIndexOffsetInt(revision);
+			daIndex.seek(recordOffset + 12); // 6+2+4
+			int actualLen = daIndex.readInt();
+			return actualLen; 
+		} catch (IOException ex) {
+			ex.printStackTrace(); // log error. FIXME better handling
+			throw new IllegalStateException(ex);
+		} finally {
+			daIndex.done();
+		}
+	}
+	
+	public byte[] nodeid(int revision) {
+		final int indexSize = revisionCount();
+		if (revision == TIP) {
+			revision = indexSize - 1;
+		}
+		if (revision < 0 || revision >= indexSize) {
+			throw new IllegalArgumentException(Integer.toString(revision));
+		}
+		DataAccess daIndex = getIndexStream();
+		try {
+			int recordOffset = getIndexOffsetInt(revision);
+			daIndex.seek(recordOffset + 32);
+			byte[] rv = new byte[20];
+			daIndex.readBytes(rv, 0, 20);
+			return rv;
+		} catch (IOException ex) {
+			ex.printStackTrace();
+			throw new IllegalStateException();
+		} finally {
+			daIndex.done();
+		}
+	}
+	
+	public int linkRevision(int revision) {
+		final int last = revisionCount() - 1;
+		if (revision == TIP) {
+			revision = last;
+		}
+		if (revision < 0 || revision > last) {
+			throw new IllegalArgumentException(Integer.toString(revision));
+		}
+		DataAccess daIndex = getIndexStream();
+		try {
+			int recordOffset = getIndexOffsetInt(revision);
+			daIndex.seek(recordOffset + 20);
+			int linkRev = daIndex.readInt();
+			return linkRev;
+		} catch (IOException ex) {
+			ex.printStackTrace();
+			throw new IllegalStateException();
+		} finally {
+			daIndex.done();
+		}
+	}
+	
+	// Perhaps, RevlogStream should be limited to use of plain int revisions for access,
+	// while Nodeids should be kept on the level up, in Revlog. Guess, Revlog better keep
+	// map of nodeids, and once this comes true, we may get rid of this method.
+	// Unlike its counterpart, {@link Revlog#getLocalRevisionNumber()}, doesn't fail with exception if node not found,
+	/**
+	 * @return integer in [0..revisionCount()) or {@link HgRepository#BAD_REVISION} if not found
+	 */
+	public int findLocalRevisionNumber(Nodeid nodeid) {
+		// XXX this one may be implemented with iterate() once there's mechanism to stop iterations
+		final int indexSize = revisionCount();
+		DataAccess daIndex = getIndexStream();
+		try {
+			byte[] nodeidBuf = new byte[20];
+			for (int i = 0; i < indexSize; i++) {
+				daIndex.skip(8);
+				int compressedLen = daIndex.readInt();
+				daIndex.skip(20);
+				daIndex.readBytes(nodeidBuf, 0, 20);
+				if (nodeid.equalsTo(nodeidBuf)) {
+					return i;
+				}
+				daIndex.skip(inline ? 12 + compressedLen : 12);
+			}
+		} catch (IOException ex) {
+			ex.printStackTrace(); // log error. FIXME better handling
+			throw new IllegalStateException(ex);
+		} finally {
+			daIndex.done();
+		}
+		return BAD_REVISION;
+	}
+
+
+	private final int REVLOGV1_RECORD_SIZE = 64;
+
+	// should be possible to use TIP, ALL, or -1, -2, -n notation of Hg
+	// ? boolean needsNodeid
+	public void iterate(int start, int end, boolean needData, Inspector inspector) {
+		initOutline();
+		final int indexSize = revisionCount();
+		if (indexSize == 0) {
+			return;
+		}
+		if (end == TIP) {
+			end = indexSize - 1;
+		}
+		if (start == TIP) {
+			start = indexSize - 1;
+		}
+		if (start < 0 || start >= indexSize) {
+			throw new IllegalArgumentException(String.format("Bad left range boundary %d in [0..%d]", start, indexSize-1));
+		}
+		if (end < start || end >= indexSize) {
+			throw new IllegalArgumentException(String.format("Bad right range boundary %d in [0..%d]", end, indexSize-1));
+		}
+		// XXX may cache [start .. end] from index with a single read (pre-read)
+		
+		DataAccess daIndex = null, daData = null;
+		daIndex = getIndexStream();
+		if (needData && !inline) {
+			daData = getDataStream();
+		}
+		try {
+			byte[] nodeidBuf = new byte[20];
+			DataAccess lastUserData = null;
+			int i;
+			boolean extraReadsToBaseRev = false;
+			if (needData && getBaseRevision(start) < start) {
+				i = getBaseRevision(start);
+				extraReadsToBaseRev = true;
+			} else {
+				i = start;
+			}
+			
+			daIndex.seek(getIndexOffsetInt(i));
+			for (; i <= end; i++ ) {
+				if (inline && needData) {
+					// inspector reading data (though FilterDataAccess) may have affected index position
+					daIndex.seek(getIndexOffsetInt(i));
+				}
+				long l = daIndex.readLong(); // 0
+				long offset = i == 0 ? 0 : (l >>> 16);
+				@SuppressWarnings("unused")
+				int flags = (int) (l & 0X0FFFF);
+				int compressedLen = daIndex.readInt(); // +8
+				int actualLen = daIndex.readInt(); // +12
+				int baseRevision = daIndex.readInt(); // +16
+				int linkRevision = daIndex.readInt(); // +20
+				int parent1Revision = daIndex.readInt();
+				int parent2Revision = daIndex.readInt();
+				// Hg has 32 bytes here, uses 20 for nodeid, and keeps 12 last bytes empty
+				daIndex.readBytes(nodeidBuf, 0, 20); // +32
+				daIndex.skip(12);
+				DataAccess userDataAccess = null;
+				if (needData) {
+					final byte firstByte;
+					int streamOffset;
+					DataAccess streamDataAccess;
+					if (inline) {
+						streamDataAccess = daIndex;
+						streamOffset = getIndexOffsetInt(i) + REVLOGV1_RECORD_SIZE; // don't need to do seek as it's actual position in the index stream
+					} else {
+						streamOffset = (int) offset;
+						streamDataAccess = daData;
+						daData.seek(streamOffset);
+					}
+					final boolean patchToPrevious = baseRevision != i; // the only way I found to tell if it's a patch
+					firstByte = streamDataAccess.readByte();
+					if (firstByte == 0x78 /* 'x' */) {
+						userDataAccess = new InflaterDataAccess(streamDataAccess, streamOffset, compressedLen, patchToPrevious ? -1 : actualLen);
+					} else if (firstByte == 0x75 /* 'u' */) {
+						userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset+1, compressedLen-1);
+					} else {
+						// XXX Python impl in fact throws exception when there's not 'x', 'u' or '0'
+						// but I don't see reason not to return data as is 
+						userDataAccess = new FilterDataAccess(streamDataAccess, streamOffset, compressedLen);
+					}
+					// XXX 
+					if (patchToPrevious) {
+						// this is a patch
+						LinkedList<PatchRecord> patches = new LinkedList<PatchRecord>();
+						while (!userDataAccess.isEmpty()) {
+							PatchRecord pr = PatchRecord.read(userDataAccess);
+//							System.out.printf("PatchRecord:%d %d %d\n", pr.start, pr.end, pr.len);
+							patches.add(pr);
+						}
+						userDataAccess.done();
+						//
+						byte[] userData = apply(lastUserData, actualLen, patches);
+						userDataAccess = new ByteArrayDataAccess(userData);
+					}
+				} else {
+					if (inline) {
+						daIndex.skip(compressedLen);
+					}
+				}
+				if (!extraReadsToBaseRev || i >= start) {
+					inspector.next(i, actualLen, baseRevision, linkRevision, parent1Revision, parent2Revision, nodeidBuf, userDataAccess);
+				}
+				if (userDataAccess != null) {
+					userDataAccess.reset();
+					if (lastUserData != null) {
+						lastUserData.done();
+					}
+					lastUserData = userDataAccess;
+				}
+			}
+		} catch (IOException ex) {
+			throw new HgBadStateException(ex); // FIXME need better handling
+		} finally {
+			daIndex.done();
+			if (daData != null) {
+				daData.done();
+			}
+		}
+	}
+
+	private int getBaseRevision(int revision) {
+		return baseRevisions[revision];
+	}
+
+	/**
+	 * @return  offset of the revision's record in the index (.i) stream
+	 */
+	private int getIndexOffsetInt(int revision) {
+		return inline ? indexRecordOffset[revision] : revision * REVLOGV1_RECORD_SIZE;
+	}
+
+	private void initOutline() {
+		if (baseRevisions != null && baseRevisions.length > 0) {
+			return;
+		}
+		ArrayList<Integer> resBases = new ArrayList<Integer>();
+		ArrayList<Integer> resOffsets = new ArrayList<Integer>();
+		DataAccess da = getIndexStream();
+		try {
+			if (da.isEmpty()) {
+				// do not fail with exception if stream is empty, it's likely intentional
+				baseRevisions = new int[0];
+				return;
+			}
+			int versionField = da.readInt();
+			da.readInt(); // just to skip next 4 bytes of offset + flags
+			final int INLINEDATA = 1 << 16;
+			inline = (versionField & INLINEDATA) != 0;
+			long offset = 0; // first offset is always 0, thus Hg uses it for other purposes
+			while(true) {
+				int compressedLen = da.readInt();
+				// 8+4 = 12 bytes total read here
+				@SuppressWarnings("unused")
+				int actualLen = da.readInt();
+				int baseRevision = da.readInt();
+				// 12 + 8 = 20 bytes read here
+//				int linkRevision = di.readInt();
+//				int parent1Revision = di.readInt();
+//				int parent2Revision = di.readInt();
+//				byte[] nodeid = new byte[32];
+				resBases.add(baseRevision);
+				if (inline) {
+					int o = (int) offset;
+					if (o != offset) {
+						// just in case, can't happen, ever, unless HG (or some other bad tool) produces index file 
+						// with inlined data of size greater than 2 Gb.
+						throw new HgBadStateException("Data too big, offset didn't fit to sizeof(int)");
+					}
+					resOffsets.add(o + REVLOGV1_RECORD_SIZE * resOffsets.size());
+					da.skip(3*4 + 32 + compressedLen); // Check: 44 (skip) + 20 (read) = 64 (total RevlogNG record size)
+				} else {
+					da.skip(3*4 + 32);
+				}
+				if (da.isEmpty()) {
+					// fine, done then
+					baseRevisions = toArray(resBases);
+					if (inline) {
+						indexRecordOffset = toArray(resOffsets);
+					}
+					break;
+				} else {
+					// start reading next record
+					long l = da.readLong();
+					offset = l >>> 16;
+				}
+			}
+		} catch (IOException ex) {
+			ex.printStackTrace(); // log error
+			// too bad, no outline then, but don't fail with NPE
+			baseRevisions = new int[0];
+		} finally {
+			da.done();
+		}
+	}
+	
+	private static int[] toArray(List<Integer> l) {
+		int[] rv = new int[l.size()];
+		for (int i = 0; i < rv.length; i++) {
+			rv[i] = l.get(i);
+		}
+		return rv;
+	}
+	
+
+	// mpatch.c : apply()
+	// FIXME need to implement patch merge (fold, combine, gather and discard from aforementioned mpatch.[c|py]), also see Revlog and Mercurial PDF
+	public/*for HgBundle; until moved to better place*/static byte[] apply(DataAccess baseRevisionContent, int outcomeLen, List<PatchRecord> patch) throws IOException {
+		int last = 0, destIndex = 0;
+		if (outcomeLen == -1) {
+			outcomeLen = baseRevisionContent.length();
+			for (PatchRecord pr : patch) {
+				outcomeLen += pr.start - last + pr.len;
+				last = pr.end;
+			}
+			outcomeLen -= last;
+			last = 0;
+		}
+		byte[] rv = new byte[outcomeLen];
+		for (PatchRecord pr : patch) {
+			baseRevisionContent.seek(last);
+			baseRevisionContent.readBytes(rv, destIndex, pr.start-last);
+			destIndex += pr.start - last;
+			System.arraycopy(pr.data, 0, rv, destIndex, pr.data.length);
+			destIndex += pr.data.length;
+			last = pr.end;
+		}
+		baseRevisionContent.seek(last);
+		baseRevisionContent.readBytes(rv, destIndex, (int) (baseRevisionContent.length() - last));
+		return rv;
+	}
+
+	// @see http://mercurial.selenic.com/wiki/BundleFormat, in Changelog group description
+	public static class PatchRecord {
+		/*
+		   Given there are pr1 and pr2:
+		     pr1.start to pr1.end will be replaced with pr's data (of pr1.len)
+		     pr1.end to pr2.start gets copied from base
+		 */
+		public int start, end, len;
+		public byte[] data;
+
+		// TODO consider PatchRecord that only records data position (absolute in data source), and acquires data as needed 
+		private PatchRecord(int p1, int p2, int length, byte[] src) {
+			start = p1;
+			end = p2;
+			len = length;
+			data = src;
+		}
+
+		/*package-local*/ static PatchRecord read(byte[] data, int offset) {
+			final int x = offset; // shorthand
+			int p1 =  ((data[x] & 0xFF)<< 24)    | ((data[x+1] & 0xFF) << 16) | ((data[x+2] & 0xFF) << 8)  | (data[x+3] & 0xFF);
+			int p2 =  ((data[x+4] & 0xFF) << 24) | ((data[x+5] & 0xFF) << 16) | ((data[x+6] & 0xFF) << 8)  | (data[x+7] & 0xFF);
+			int len = ((data[x+8] & 0xFF) << 24) | ((data[x+9] & 0xFF) << 16) | ((data[x+10] & 0xFF) << 8) | (data[x+11] & 0xFF);
+			byte[] dataCopy = new byte[len];
+			System.arraycopy(data, x+12, dataCopy, 0, len);
+			return new PatchRecord(p1, p2, len, dataCopy);
+		}
+
+		public /*for HgBundle*/ static PatchRecord read(DataAccess da) throws IOException {
+			int p1 = da.readInt();
+			int p2 = da.readInt();
+			int len = da.readInt();
+			byte[] src = new byte[len];
+			da.readBytes(src, 0, len);
+			return new PatchRecord(p1, p2, len, src);
+		}
+	}
+
+	// FIXME byte[] data might be too expensive, for few usecases it may be better to have intermediate Access object (when we don't need full data 
+	// instantly - e.g. calculate hash, or comparing two revisions
+	public interface Inspector {
+		// XXX boolean retVal to indicate whether to continue?
+		// TODO specify nodeid and data length, and reuse policy (i.e. if revlog stream doesn't reuse nodeid[] for each call)
+		// implementers shall not invoke DataAccess.done(), it's accomplished by #iterate at appropraite moment
+		void next(int revisionNumber, int actualLen, int baseRevision, int linkRevision, int parent1Revision, int parent2Revision, byte[/*20*/] nodeid, DataAccess data);
+	}
+}