From DONOTREPLY at icculus.org Sun Nov 5 05:06:02 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 05:06:02 -0500
Subject: r795 - in trunk: . archivers lzma
Message-ID: <20061105100602.17660.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 05:06:02 -0500 (Sun, 05 Nov 2006)
New Revision: 795
Modified:
trunk/CHANGELOG
trunk/archivers/lzma.c
trunk/lzma/LZMA-LICENSE.txt
Log:
More 7zip work (thanks, Dennis!)
Modified: trunk/CHANGELOG
===================================================================
--- trunk/CHANGELOG 2006-09-27 09:21:56 UTC (rev 794)
+++ trunk/CHANGELOG 2006-11-05 10:06:02 UTC (rev 795)
@@ -2,6 +2,7 @@
* CHANGELOG.
*/
+11052006 - More 7zip archiver work (thanks, Dennis!).
09272006 - Reworked 7zip archiver (thanks, Dennis!).
09232006 - Fixed typo in doxygen comment.
04112006 - Added LZMA archiver...7zip support (thanks, Dennis!).
Modified: trunk/archivers/lzma.c
===================================================================
--- trunk/archivers/lzma.c 2006-09-27 09:21:56 UTC (rev 794)
+++ trunk/archivers/lzma.c 2006-11-05 10:06:02 UTC (rev 795)
@@ -51,16 +51,29 @@
void *File;
} CFileInStream;
-/* Set by LZMA_openArchive, except blockXXX which is handled by LZMA_read() */
+/*
+ * In LZMA the archive is split into blocks; those are called folders.
+ * Set by LZMA_read()
+*/
+typedef struct _LZMAfolder
+{
+ PHYSFS_uint8 *cache; /* Decompressed folder data; NULL while not cached */
+ PHYSFS_uint32 size; /* Size of cached folder. NOTE(review): its address is handed to SzExtract where a size_t was passed before - confirm the pointer types match */
+ PHYSFS_uint32 index; /* Index of folder in archive */
+ PHYSFS_uint32 references; /* Number of open files using this folder */
+} LZMAfolder;
+
+/*
+ * Set by LZMA_openArchive(), except folder which gets its values
+ * in LZMA_read()
+ */
typedef struct _LZMAarchive
{
struct _LZMAentry *firstEntry; /* Used for cleanup on shutdown */
struct _LZMAentry *lastEntry;
+ LZMAfolder *folder; /* Array of folders */
CArchiveDatabaseEx db; /* For 7z: Database */
CFileInStream stream; /* For 7z: Input file incl. read and seek callbacks */
- unsigned char *block; /* Currently cached block */
- size_t blockSize; /* Size of current block */
- PHYSFS_uint32 blockIndex; /* Index of current block */
} LZMAarchive;
/* Set by LZMA_openRead(), except offset which is set by LZMA_read() */
@@ -70,8 +83,9 @@
struct _LZMAentry *previous;
LZMAarchive *archive; /* Link to corresponding archive */
CFileItem *file; /* For 7z: File info, eg. name, size */
- PHYSFS_uint32 index; /* Index inside the archive */
- size_t offset; /* Offset inside archive block */
+ PHYSFS_uint32 fileIndex; /* Index of file in archive */
+ PHYSFS_uint32 folderIndex; /* Index of folder in archive */
+ size_t offset; /* Offset in folder */
PHYSFS_uint32 position; /* Current "virtual" position in file */
} LZMAentry;
@@ -273,23 +287,30 @@
allocTempImp.Alloc = SzAllocPhysicsFS;
allocTempImp.Free = SzFreePhysicsFS;
- if (lzma_err(SzExtract(
- &entry->archive->stream.InStream, /* compressed data */
- &entry->archive->db,
- entry->index,
- &entry->archive->blockIndex, /* Index of currently cached block, may be changed by SzExtract */
- &entry->archive->block, /* Cache of current decompressed block, may be allocated/freed by SzExtract */
- &entry->archive->blockSize, /* Size of current cache, may be changed by SzExtract */
- &entry->offset, /* Offset of this file inside the cache block, set by SzExtract */
- &fileSize, /* Size of this file */
- &allocImp,
- &allocTempImp
- )) != SZ_OK)
- return -1;
+ /* Only decompress the folder if it is not already cached */
+ if (entry->archive->folder[entry->folderIndex].cache == NULL)
+ if (lzma_err(SzExtract(
+ &entry->archive->stream.InStream, /* compressed data */
+ &entry->archive->db,
+ entry->fileIndex,
+ /* Index of cached folder, will be changed by SzExtract */
+ &entry->archive->folder[entry->folderIndex].index,
+ /* Cache for decompressed folder, allocated/freed by SzExtract */
+ &entry->archive->folder[entry->folderIndex].cache,
+ /* Size of cache, will be changed by SzExtract */
+ &entry->archive->folder[entry->folderIndex].size,
+ /* Offset of this file inside the cache, set by SzExtract */
+ &entry->offset,
+ &fileSize, /* Size of this file */
+ &allocImp,
+ &allocTempImp
+ )) != SZ_OK)
+ return -1;
/* Copy wanted bytes over from cache to outBuffer */
strncpy(outBuffer,
- (void*)(entry->archive->block + entry->offset + entry->position),
+ (void*) (entry->archive->folder[entry->folderIndex].cache +
+ entry->offset + entry->position),
wantedSize);
entry->position += wantedSize;
return objCount;
@@ -352,6 +373,13 @@
if (entry->next != NULL)
entry->next->previous = entry->previous;
+ entry->archive->folder[entry->folderIndex].references--;
+ if (entry->archive->folder[entry->folderIndex].references == 0)
+ {
+ allocator.Free(entry->archive->folder[entry->folderIndex].cache);
+ entry->archive->folder[entry->folderIndex].cache = NULL;
+ }
+
allocator.Free(entry);
entry = NULL;
@@ -384,17 +412,17 @@
static void *LZMA_openArchive(const char *name, int forWriting)
{
+ PHYSFS_uint64 len;
LZMAarchive *archive = NULL;
ISzAlloc allocImp;
ISzAlloc allocTempImp;
BAIL_IF_MACRO(forWriting, ERR_ARC_IS_READ_ONLY, NULL);
- BAIL_IF_MACRO(!LZMA_isArchive(name, forWriting), ERR_UNSUPPORTED_ARCHIVE, 0);
+ BAIL_IF_MACRO(!LZMA_isArchive(name,forWriting), ERR_UNSUPPORTED_ARCHIVE, 0);
archive = (LZMAarchive *) allocator.Malloc(sizeof (LZMAarchive));
BAIL_IF_MACRO(archive == NULL, ERR_OUT_OF_MEMORY, NULL);
- archive->block = NULL;
archive->firstEntry = NULL;
archive->lastEntry = NULL;
@@ -424,6 +452,16 @@
return NULL;
} /* if */
+ len = archive->db.Database.NumFolders * sizeof (LZMAfolder);
+ archive->folder = (LZMAfolder *) allocator.Malloc(len);
+ BAIL_IF_MACRO(archive->folder == NULL, ERR_OUT_OF_MEMORY, NULL);
+
+ /*
+ * Init with 0 so we know when a folder is already cached
+ * Values will be set by LZMA_read()
+ */
+ memset(archive->folder, 0, len);
+
return(archive);
} /* LZMA_openArchive */
@@ -529,20 +567,27 @@
{
LZMAarchive *archive = (LZMAarchive *) opaque;
LZMAentry *entry = NULL;
- PHYSFS_uint32 index = 0;
+ PHYSFS_uint32 fileIndex = 0;
+ PHYSFS_uint32 folderIndex = 0;
- *fileExists = lzma_find_entry(archive, name, &index);
+ *fileExists = lzma_find_entry(archive, name, &fileIndex);
BAIL_IF_MACRO(!*fileExists, ERR_NO_SUCH_FILE, NULL);
+ folderIndex = archive->db.FileIndexToFolderIndexMap[fileIndex];
+ BAIL_IF_MACRO(folderIndex == (PHYSFS_uint32)-1, ERR_UNKNOWN_ERROR, NULL);
+
entry = (LZMAentry *) allocator.Malloc(sizeof (LZMAentry));
BAIL_IF_MACRO(entry == NULL, ERR_OUT_OF_MEMORY, NULL);
- entry->index = index;
+ entry->fileIndex = fileIndex;
+ entry->folderIndex = folderIndex;
entry->archive = archive;
- entry->file = archive->db.Database.Files + entry->index;
+ entry->file = archive->db.Database.Files + entry->fileIndex;
entry->offset = 0; /* Offset will be set by LZMA_read() */
entry->position = 0;
+ archive->folder[folderIndex].references++;
+
entry->next = NULL;
entry->previous = entry->archive->lastEntry;
if (entry->previous != NULL)
@@ -584,7 +629,7 @@
__PHYSFS_platformClose(archive->stream.File);
/* Free the cache which might have been allocated by LZMA_read() */
- allocator.Free(archive->block);
+ allocator.Free(archive->folder);
allocator.Free(archive);
} /* LZMA_dirClose */
Modified: trunk/lzma/LZMA-LICENSE.txt
===================================================================
--- trunk/lzma/LZMA-LICENSE.txt 2006-09-27 09:21:56 UTC (rev 794)
+++ trunk/lzma/LZMA-LICENSE.txt 2006-11-05 10:06:02 UTC (rev 795)
@@ -92,4 +92,3 @@
You should have received a copy of the Common Public License
along with this library.
-k
From DONOTREPLY at icculus.org Sun Nov 5 06:09:42 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 06:09:42 -0500
Subject: r796 - trunk/platform
Message-ID: <20061105110942.2263.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 06:09:42 -0500 (Sun, 05 Nov 2006)
New Revision: 796
Modified:
trunk/platform/win32.c
Log:
Silly comment typo.
Modified: trunk/platform/win32.c
===================================================================
--- trunk/platform/win32.c 2006-11-05 10:06:02 UTC (rev 795)
+++ trunk/platform/win32.c 2006-11-05 11:09:42 UTC (rev 796)
@@ -60,7 +60,7 @@
/*
* Users without the platform SDK don't have this defined. The original docs
* for SetFilePointer() just said to compare with 0xFFFFFFFF, so this should
- * work as desired
+ * work as desired.
*/
#define PHYSFS_INVALID_SET_FILE_POINTER 0xFFFFFFFF
From DONOTREPLY at icculus.org Sun Nov 5 06:10:14 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 06:10:14 -0500
Subject: r797 - trunk
Message-ID: <20061105111014.2452.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 06:10:14 -0500 (Sun, 05 Nov 2006)
New Revision: 797
Modified:
trunk/CHANGELOG
trunk/Makefile.am.newautomake
trunk/Makefile.am.oldautomake
trunk/makeos2.cmd
trunk/physfs.dsp
trunk/physfs.h
trunk/physfs.vcproj
trunk/physfsMPW.make
trunk/physfs_static.dsp
Log:
Initial Unicode work.
Modified: trunk/CHANGELOG
===================================================================
--- trunk/CHANGELOG 2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/CHANGELOG 2006-11-05 11:10:14 UTC (rev 797)
@@ -2,7 +2,7 @@
* CHANGELOG.
*/
-11052006 - More 7zip archiver work (thanks, Dennis!).
+11052006 - More 7zip archiver work (thanks, Dennis!). Initial Unicode work.
09272006 - Reworked 7zip archiver (thanks, Dennis!).
09232006 - Fixed typo in doxygen comment.
04112006 - Added LZMA archiver...7zip support (thanks, Dennis!).
Modified: trunk/Makefile.am.newautomake
===================================================================
--- trunk/Makefile.am.newautomake 2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/Makefile.am.newautomake 2006-11-05 11:10:14 UTC (rev 797)
@@ -110,6 +110,7 @@
libphysfs_la_SOURCES = \
physfs.c \
physfs_internal.h \
+ physfs_unicode.c \
physfs_byteorder.c
if BUILD_ZLIB
Modified: trunk/Makefile.am.oldautomake
===================================================================
--- trunk/Makefile.am.oldautomake 2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/Makefile.am.oldautomake 2006-11-05 11:10:14 UTC (rev 797)
@@ -9,6 +9,7 @@
libphysfs_la_SOURCES = \
physfs.c \
physfs_internal.h \
+ physfs_unicode.c \
physfs_byteorder.c
if BUILD_ZLIB
Modified: trunk/makeos2.cmd
===================================================================
--- trunk/makeos2.cmd 2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/makeos2.cmd 2006-11-05 11:10:14 UTC (rev 797)
@@ -110,6 +110,11 @@
@echo "PHYSFS_getCdRomDirsCallback" >> bin\physfs.def
@echo "PHYSFS_getSearchPathCallback" >> bin\physfs.def
@echo "PHYSFS_enumerateFilesCallback" >> bin\physfs.def
+@echo "PHYSFS_utf8toucs2" >> bin\physfs.def
+@echo "PHYSFS_utf8fromucs2" >> bin\physfs.def
+@echo "PHYSFS_utf8toucs4" >> bin\physfs.def
+@echo "PHYSFS_utf8fromucs4" >> bin\physfs.def
+@echo "PHYSFS_utf8fromlatin1" >> bin\physfs.def
@echo Building export library...
emximp -o bin/physfs.lib bin/physfs.def
@@ -118,6 +123,7 @@
@echo on
gcc %CFLAGS% -o bin/physfs.obj physfs.c
gcc %CFLAGS% -o bin/physfs_byteorder.obj physfs_byteorder.c
+gcc %CFLAGS% -o bin/physfs_unicode.obj physfs_unicode.c
gcc %CFLAGS% -o bin/os2.obj platform/os2.c
gcc %CFLAGS% -o bin/dir.obj archivers/dir.c
gcc %CFLAGS% -o bin/grp.obj archivers/grp.c
Modified: trunk/physfs.dsp
===================================================================
--- trunk/physfs.dsp 2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/physfs.dsp 2006-11-05 11:10:14 UTC (rev 797)
@@ -149,6 +149,10 @@
# End Source File
# Begin Source File
+SOURCE=.\physfs_unicode.c
+# End Source File
+# Begin Source File
+
SOURCE=.\archivers\qpak.c
# End Source File
# Begin Source File
Modified: trunk/physfs.h
===================================================================
--- trunk/physfs.h 2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/physfs.h 2006-11-05 11:10:14 UTC (rev 797)
@@ -147,6 +147,40 @@
* - .WAD (DOOM engine archives)
* - .MIX (Older Westwood games archives)
*
+ *
+ * String policy for PhysicsFS 2.0 and later:
+ *
+ * PhysicsFS 1.0 deals with null-terminated ASCII strings. All high ASCII
+ * chars resulted in undefined behaviour, and there was no Unicode support.
+ *
+ * All strings passed through PhysicsFS are in null-terminated UTF-8 format.
+ * This means that if all you care about is English (ASCII characters <= 127)
+ * then you just use regular C strings. If you care about Unicode (and you
+ * should!) then you need to figure out what your platform wants, needs, and
+ * offers. If you are on Windows and build with Unicode support, your TCHAR
+ * strings are two bytes per character (this is called "UCS-2 encoding"). You
+ * should convert them to UTF-8 before handing them to PhysicsFS with
+ * PHYSFS_utf8fromucs2(). If you're using Unix or Mac OS X, your wchar_t
+ * strings are four bytes per character ("UCS-4 encoding"). Use
+ * PHYSFS_utf8fromucs4(). Mac OS X can give you UTF-8 directly from a CFString,
+ * and many Unixes generally give you C strings in UTF-8 format everywhere.
+ * If you have a single-byte high ASCII charset, like so-many European
+ * "codepages" you may be out of luck. We'll convert from "Latin1" to UTF-8
+ * only, and never back to Latin1. If you're above ASCII 127, all bets are
+ * off: move to Unicode or use your platform's facilities. Passing a C string
+ * with high-ASCII data that isn't UTF-8 encoded will NOT do what you expect!
+ *
+ * Naturally, there's also PHYSFS_utf8toucs2() and PHYSFS_utf8toucs4() to get
+ * data back into a format you like. Behind the scenes, PhysicsFS will use
+ * Unicode where possible: the UTF-8 strings on Windows will be converted
+ * and used with the multibyte Windows APIs, for example.
+ *
+ * PhysicsFS offers basic encoding conversion support, but not a whole string
+ * library. Get your stuff into whatever format you can work with.
+ *
+ *
+ * Other stuff:
+ *
* Please see the file LICENSE in the source's root directory for licensing
* and redistribution rights.
*
@@ -1989,7 +2023,129 @@
PHYSFS_EnumFilesCallback c,
void *d);
+/**
+ * \fn void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a UCS-4 string to a UTF-8 string.
+ *
+ * UCS-4 strings are 32-bits per character: \c wchar_t on Unix.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ * please allocate a buffer that is the same size as the source buffer. UTF-8
+ * never uses more than 32-bits per character, so while it may shrink a UCS-4
+ * string, it will never expand it.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ * will always be null-terminated and never have an incomplete UTF-8
+ * sequence at the end.
+ *
+ * \param src Null-terminated source string in UCS-4 format.
+ * \param dst Buffer to store converted UTF-8 string.
+ * \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst,
+ PHYSFS_uint64 len);
+/**
+ * \fn void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
+ * \brief Convert a UTF-8 string to a UCS-4 string.
+ *
+ * UCS-4 strings are 32-bits per character: \c wchar_t on Unix.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ * please allocate a buffer that is four times the size of the source buffer.
+ * UTF-8 uses from one to four bytes per character, but UCS-4 always uses
+ * four, so an entirely low-ASCII string will quadruple in size!
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ * will always be null-terminated and never have an incomplete UCS-4
+ * sequence at the end.
+ *
+ * \param src Null-terminated source string in UTF-8 format.
+ * \param dst Buffer to store converted UCS-4 string.
+ * \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst,
+ PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a UCS-2 string to a UTF-8 string.
+ *
+ * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
+ * with Unicode support.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ * please allocate a buffer that is double the size of the source buffer.
+ * UTF-8 never uses more than 32-bits per character, so while it may shrink
+ * a UCS-2 string, it may also expand it.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ * will always be null-terminated and never have an incomplete UTF-8
+ * sequence at the end.
+ *
+ * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
+ * values at this time.
+ *
+ * \param src Null-terminated source string in UCS-2 format.
+ * \param dst Buffer to store converted UTF-8 string.
+ * \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst,
+ PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
+ * \brief Convert a UTF-8 string to a UCS-2 string.
+ *
+ * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
+ * with Unicode support.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ * please allocate a buffer that is double the size of the source buffer.
+ * UTF-8 uses from one to four bytes per character, but UCS-2 always uses
+ * two, so an entirely low-ASCII string will double in size!
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ * will always be null-terminated and never have an incomplete UCS-2
+ * sequence at the end.
+ *
+ * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
+ * values at this time.
+ *
+ * \param src Null-terminated source string in UTF-8 format.
+ * \param dst Buffer to store converted UCS-2 string.
+ * \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst,
+ PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a Latin1 string to a UTF-8 string.
+ *
+ * Latin1 strings are 8-bits per character: a popular "high ASCII"
+ * encoding.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ * please allocate a buffer that is double the size of the source buffer.
+ * UTF-8 expands latin1 codepoints over 127 from 1 to 2 bytes, so the string
+ * may grow in some cases.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ * will always be null-terminated and never have an incomplete UTF-8
+ * sequence at the end.
+ *
+ * Please note that we do not supply a UTF-8 to Latin1 converter, since Latin1
+ * can't express most Unicode codepoints. It's a legacy encoding; you should
+ * be converting away from it at all times.
+ *
+ * \param src Null-terminated source string in Latin1 format.
+ * \param dst Buffer to store converted UTF-8 string.
+ * \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromlatin1(const char *src, char *dst,
+ PHYSFS_uint64 len);
+
/* Everything above this line is part of the PhysicsFS 2.0 API. */
Modified: trunk/physfs.vcproj
===================================================================
--- trunk/physfs.vcproj 2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/physfs.vcproj 2006-11-05 11:10:14 UTC (rev 797)
@@ -194,6 +194,9 @@
RelativePath=".\physfs_byteorder.c">
+
+
Author: icculus
Date: 2006-11-05 06:30:48 -0500 (Sun, 05 Nov 2006)
New Revision: 798
Modified:
trunk/lzma/
Log:
Ignore automake files.
Property changes on: trunk/lzma
___________________________________________________________________
Name: svn:ignore
+ Makefile.in
Makefile
From DONOTREPLY at icculus.org Sun Nov 5 06:32:18 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 06:32:18 -0500
Subject: r799 - trunk
Message-ID: <20061105113218.8628.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 06:32:18 -0500 (Sun, 05 Nov 2006)
New Revision: 799
Added:
trunk/physfs_unicode.c
Log:
Initial add.
Added: trunk/physfs_unicode.c
===================================================================
--- trunk/physfs_unicode.c (rev 0)
+++ trunk/physfs_unicode.c 2006-11-05 11:32:18 UTC (rev 799)
@@ -0,0 +1,338 @@
+#if HAVE_CONFIG_H
+#  include <config.h>  /* build-time configuration header (operand restored) */
+#endif
+
+#include "physfs.h"
+
+#define __PHYSICSFS_INTERNAL__
+#include "physfs_internal.h"
+
+
+/*
+ * From rfc3629, the UTF-8 spec:
+ * http://www.ietf.org/rfc/rfc3629.txt
+ *
+ * Char. number range | UTF-8 octet sequence
+ * (hexadecimal) | (binary)
+ * --------------------+---------------------------------------------
+ * 0000 0000-0000 007F | 0xxxxxxx
+ * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+
+/*
+ * This may not be the best value, but it's one that isn't represented
+ * in Unicode (0x10FFFF is the largest codepoint value). We return this
+ * value from utf8codepoint() if there are bogus bits in the
+ * stream. The PHYSFS_utf8to*() converters turn this value into something
+ * reasonable (like a question mark), for text that wants to try to recover,
+ * whereas utf8valid() will use the value to determine if a string has bad
+ * bits.
+ */
+#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
+
+/*
+ * This is the codepoint we currently return when there was bogus bits in a
+ * UTF-8 string. May not fly in Asian locales?
+ */
+#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
+
+/*
+ * Decode one codepoint from the UTF-8 string at *_str.
+ *
+ * Returns 0 at the null terminator (and does NOT advance *_str), the decoded
+ *  codepoint for a well-formed sequence (advancing *_str past it), or
+ *  UNICODE_BOGUS_CHAR_VALUE for anything malformed or illegal.
+ *
+ * Whenever a non-zero value is returned, *_str has advanced by at least one
+ *  byte, so a caller looping until 0 always makes progress. If a sequence is
+ *  cut short by a byte that isn't a 10xxxxxx continuation, only the lead byte
+ *  is consumed; the bytes after it are then examined on their own, and the
+ *  null terminator is never skipped over.
+ */
+static PHYSFS_uint32 utf8codepoint(const char **_str)
+{
+    const char *str = *_str;
+    PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
+    PHYSFS_uint32 retval = 0;
+    PHYSFS_uint32 minval = 0;  /* smallest value legal for this length. */
+    PHYSFS_uint32 maxval = 0;  /* largest value legal for this length. */
+    int trailing = 0;          /* number of continuation octets expected. */
+    int i;
+
+    if (octet == 0)  /* null terminator, end of string. */
+        return 0;
+
+    (*_str)++;  /* always consume the lead byte, even on error. */
+
+    if (octet < 128)  /* one octet char: 0 to 127 */
+        return(octet);
+
+    else if (octet < 192)  /* bad (starts with 10xxxxxx). */
+    {
+        /*
+         * Apparently each of these is supposed to be flagged as a bogus
+         *  char, instead of just resyncing to the next valid codepoint.
+         */
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } /* else if */
+
+    else if (octet < 224)  /* two octets */
+    {
+        trailing = 1;
+        retval = octet - (128+64);
+        minval = 0x80;
+        maxval = 0x7FF;
+    } /* else if */
+
+    else if (octet < 240)  /* three octets */
+    {
+        trailing = 2;
+        retval = octet - (128+64+32);
+        minval = 0x800;
+        maxval = 0xFFFD;  /* 0xFFFE and 0xFFFF are treated as illegal. */
+    } /* else if */
+
+    else if (octet < 248)  /* four octets */
+    {
+        trailing = 3;
+        retval = octet - (128+64+32+16);
+        minval = 0x10000;
+        maxval = 0x10FFFF;
+    } /* else if */
+
+    /*
+     * Five and six octet sequences became illegal in rfc3629.
+     *  We throw the codepoint away, but parse them to make sure we move
+     *  ahead the right number of bytes and don't overflow the buffer.
+     */
+    else if (octet < 252)  /* five octets */
+        trailing = 4;
+    else  /* six octets */
+        trailing = 5;
+
+    for (i = 0; i < trailing; i++)
+    {
+        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
+        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
+            return UNICODE_BOGUS_CHAR_VALUE;  /* only lead byte consumed. */
+        retval = ((retval << 6) | (octet - 128));
+    } /* for */
+
+    *_str += trailing;  /* skip to next possible start of codepoint. */
+
+    if (trailing > 3)  /* well-formed, but an illegal length. */
+        return UNICODE_BOGUS_CHAR_VALUE;
+
+    /* The whole UTF-16 surrogate range is illegal in UTF-8 (rfc3629). */
+    if ((retval >= 0xD800) && (retval <= 0xDFFF))
+        return UNICODE_BOGUS_CHAR_VALUE;
+
+    /* Rejects overlong encodings and values past the legal maximum. */
+    if ((retval < minval) || (retval > maxval))
+        return UNICODE_BOGUS_CHAR_VALUE;
+
+    return retval;
+} /* utf8codepoint */
+
+/*
+ * Convert null-terminated UTF-8 (src) to null-terminated UCS-4 (dst).
+ *  (len) is the size of (dst) in BYTES. Output is truncated to fit and is
+ *  always terminated, unless (len) can't hold even the terminator, in which
+ *  case (dst) is left untouched. Malformed input becomes
+ *  UNICODE_BOGUS_CHAR_CODEPOINT.
+ */
+void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
+{
+    /* len is unsigned: subtracting from a too-small value would wrap. */
+    if (len < sizeof (PHYSFS_uint32))
+        return;
+
+    len -= sizeof (PHYSFS_uint32);   /* save room for null char. */
+    while (len >= sizeof (PHYSFS_uint32))
+    {
+        PHYSFS_uint32 cp = utf8codepoint(&src);
+        if (cp == 0)
+            break;
+        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+        *(dst++) = cp;
+        len -= sizeof (PHYSFS_uint32);
+    } /* while */
+
+    *dst = 0;
+} /* PHYSFS_utf8toucs4 */
+
+/*
+ * Convert null-terminated UTF-8 (src) to null-terminated UCS-2 (dst).
+ *  (len) is the size of (dst) in BYTES. Output is truncated to fit and is
+ *  always terminated, unless (len) can't hold even the terminator, in which
+ *  case (dst) is left untouched. Malformed input, and codepoints that don't
+ *  fit in 16 bits, become UNICODE_BOGUS_CHAR_CODEPOINT.
+ */
+void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
+{
+    /* len is unsigned: subtracting from a too-small value would wrap. */
+    if (len < sizeof (PHYSFS_uint16))
+        return;
+
+    len -= sizeof (PHYSFS_uint16);   /* save room for null char. */
+    while (len >= sizeof (PHYSFS_uint16))
+    {
+        PHYSFS_uint32 cp = utf8codepoint(&src);
+        if (cp == 0)
+            break;
+        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+
+        /* !!! BLUESKY: UTF-16 surrogates? */
+        if (cp > 0xFFFF)
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+
+        *(dst++) = (PHYSFS_uint16) cp;  /* known to fit after check above. */
+        len -= sizeof (PHYSFS_uint16);
+    } /* while */
+
+    *dst = 0;
+} /* PHYSFS_utf8toucs2 */
+
+/*
+ * Append the UTF-8 encoding of codepoint (cp) at *_dst.
+ *
+ * (*_len) is the number of bytes still writable at *_dst. On success, *_dst
+ *  advances past the bytes written and *_len shrinks by the same amount.
+ *  If the encoding doesn't fit, nothing is written and *_len becomes 0 so
+ *  the caller stops (no partial sequence is ever emitted). If *_len is
+ *  already 0 this does nothing.
+ *
+ * Illegal codepoints (above 0x10FFFF, 0xFFFE, 0xFFFF, or any UTF-16
+ *  surrogate) are encoded as UNICODE_BOGUS_CHAR_CODEPOINT instead.
+ */
+static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
+{
+    char *dst = *_dst;
+    PHYSFS_uint64 len = *_len;
+
+    if (len == 0)
+        return;
+
+    if (cp > 0x10FFFF)
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+    else if ((cp == 0xFFFE) || (cp == 0xFFFF))  /* illegal values. */
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+    else if ((cp >= 0xD800) && (cp <= 0xDFFF))
+    {
+        /* The whole UTF-16 surrogate range is illegal in UTF-8 (rfc3629). */
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+    } /* else if */
+
+    /* Do the encoding... */
+    if (cp < 0x80)
+    {
+        *(dst++) = (char) cp;
+        len--;
+    } /* if */
+
+    else if (cp < 0x800)
+    {
+        if (len < 2)
+            len = 0;
+        else
+        {
+            *(dst++) = (char) ((cp >> 6) | 128 | 64);
+            *(dst++) = (char) ((cp & 0x3F) | 128);
+            len -= 2;
+        } /* else */
+    } /* else if */
+
+    else if (cp < 0x10000)
+    {
+        if (len < 3)
+            len = 0;
+        else
+        {
+            *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
+            *(dst++) = (char) (((cp >> 6) & 0x3F) | 128);
+            *(dst++) = (char) ((cp & 0x3F) | 128);
+            len -= 3;
+        } /* else */
+    } /* else if */
+
+    else
+    {
+        if (len < 4)
+            len = 0;
+        else
+        {
+            *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
+            *(dst++) = (char) (((cp >> 12) & 0x3F) | 128);
+            *(dst++) = (char) (((cp >> 6) & 0x3F) | 128);
+            *(dst++) = (char) ((cp & 0x3F) | 128);
+            len -= 4;
+        } /* else */
+    } /* else */
+
+    *_dst = dst;
+    *_len = len;
+} /* utf8fromcodepoint */
+
+#define UTF8FROMTYPE(typ, src, dst, len) \
+ len--; \
+ while (len) \
+ { \
+ const PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++); \
+ if (cp == 0) break; \
+ utf8fromcodepoint(cp, &dst, &len); \
+ } \
+ *dst = '\0'; \
+
+void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
+{
+ UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
+} /* PHYSFS_utf8fromucs4 */
+
+void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
+{
+ UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
+} /* PHYSFS_utf8fromucs4 */
+
+/* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
+void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len)
+{
+ UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
+} /* PHYSFS_utf8fromlatin1 */
+
+#undef UTF8FROMTYPE
+
+/* end of physfs_unicode.c ... */
+
From DONOTREPLY at icculus.org Sun Nov 5 14:06:23 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 14:06:23 -0500
Subject: r800 - in trunk: . platform
Message-ID: <20061105190623.6982.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 14:06:23 -0500 (Sun, 05 Nov 2006)
New Revision: 800
Modified:
trunk/CHANGELOG
trunk/platform/beos.cpp
Log:
Apparently BeOS's BPath constructor doesn't actually _need_ leaf to be
manually split out.
Modified: trunk/CHANGELOG
===================================================================
--- trunk/CHANGELOG 2006-11-05 11:32:18 UTC (rev 799)
+++ trunk/CHANGELOG 2006-11-05 19:06:23 UTC (rev 800)
@@ -3,6 +3,7 @@
*/
11052006 - More 7zip archiver work (thanks, Dennis!). Initial Unicode work.
+ Minor BeOS realpath tweak.
09272006 - Reworked 7zip archiver (thanks, Dennis!).
09232006 - Fixed typo in doxygen comment.
04112006 - Added LZMA archiver...7zip support (thanks, Dennis!).
Modified: trunk/platform/beos.cpp
===================================================================
--- trunk/platform/beos.cpp 2006-11-05 11:32:18 UTC (rev 799)
+++ trunk/platform/beos.cpp 2006-11-05 19:06:23 UTC (rev 800)
@@ -200,14 +200,7 @@
char *__PHYSFS_platformRealPath(const char *path)
{
- char *str = (char *) alloca(strlen(path) + 1);
- BAIL_IF_MACRO(str == NULL, ERR_OUT_OF_MEMORY, NULL);
- strcpy(str, path);
- char *leaf = strrchr(str, '/');
- if (leaf != NULL)
- *(leaf++) = '\0';
-
- BPath normalized(str, leaf, true); /* force normalization of path. */
+ BPath normalized(path, NULL, true); /* force normalization of path. */
const char *resolved_path = normalized.Path();
BAIL_IF_MACRO(resolved_path == NULL, ERR_NO_SUCH_FILE, NULL);
char *retval = (char *) allocator.Malloc(strlen(resolved_path) + 1);