From DONOTREPLY at icculus.org Sun Nov 5 05:06:02 2006 From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org) Date: 5 Nov 2006 05:06:02 -0500 Subject: r795 - in trunk: . archivers lzma Message-ID: <20061105100602.17660.qmail@icculus.org> Author: icculus Date: 2006-11-05 05:06:02 -0500 (Sun, 05 Nov 2006) New Revision: 795 Modified: trunk/CHANGELOG trunk/archivers/lzma.c trunk/lzma/LZMA-LICENSE.txt Log: More 7zip work (thanks, Dennis!) Modified: trunk/CHANGELOG =================================================================== --- trunk/CHANGELOG 2006-09-27 09:21:56 UTC (rev 794) +++ trunk/CHANGELOG 2006-11-05 10:06:02 UTC (rev 795) @@ -2,6 +2,7 @@ * CHANGELOG. */ +11052006 - More 7zip archiver work (thanks, Dennis!). 09272006 - Reworked 7zip archiver (thanks, Dennis!). 09232006 - Fixed typo in doxygen comment. 04112006 - Added LZMA archiver...7zip support (thanks, Dennis!). Modified: trunk/archivers/lzma.c =================================================================== --- trunk/archivers/lzma.c 2006-09-27 09:21:56 UTC (rev 794) +++ trunk/archivers/lzma.c 2006-11-05 10:06:02 UTC (rev 795) @@ -51,16 +51,29 @@ void *File; } CFileInStream; -/* Set by LZMA_openArchive, except blockXXX which is handled by LZMA_read() */ +/* + * In LZMA the archive is splited in blocks, those are called folders + * Set by LZMA_read() +*/ +typedef struct _LZMAfolder +{ + PHYSFS_uint8 *cache; /* Cached folder */ + PHYSFS_uint32 size; /* Size of folder */ + PHYSFS_uint32 index; /* Index of folder in archive */ + PHYSFS_uint32 references; /* Number of files using this block */ +} LZMAfolder; + +/* + * Set by LZMA_openArchive(), except folder which gets it's values + * in LZMA_read() + */ typedef struct _LZMAarchive { struct _LZMAentry *firstEntry; /* Used for cleanup on shutdown */ struct _LZMAentry *lastEntry; + LZMAfolder *folder; /* Array of folders */ CArchiveDatabaseEx db; /* For 7z: Database */ CFileInStream stream; /* For 7z: Input file incl. 
read and seek callbacks */ - unsigned char *block; /* Currently cached block */ - size_t blockSize; /* Size of current block */ - PHYSFS_uint32 blockIndex; /* Index of current block */ } LZMAarchive; /* Set by LZMA_openRead(), except offset which is set by LZMA_read() */ @@ -70,8 +83,9 @@ struct _LZMAentry *previous; LZMAarchive *archive; /* Link to corresponding archive */ CFileItem *file; /* For 7z: File info, eg. name, size */ - PHYSFS_uint32 index; /* Index inside the archive */ - size_t offset; /* Offset inside archive block */ + PHYSFS_uint32 fileIndex; /* Index of file in archive */ + PHYSFS_uint32 folderIndex; /* Index of folder in archive */ + size_t offset; /* Offset in folder */ PHYSFS_uint32 position; /* Current "virtual" position in file */ } LZMAentry; @@ -273,23 +287,30 @@ allocTempImp.Alloc = SzAllocPhysicsFS; allocTempImp.Free = SzFreePhysicsFS; - if (lzma_err(SzExtract( - &entry->archive->stream.InStream, /* compressed data */ - &entry->archive->db, - entry->index, - &entry->archive->blockIndex, /* Index of currently cached block, may be changed by SzExtract */ - &entry->archive->block, /* Cache of current decompressed block, may be allocated/freed by SzExtract */ - &entry->archive->blockSize, /* Size of current cache, may be changed by SzExtract */ - &entry->offset, /* Offset of this file inside the cache block, set by SzExtract */ - &fileSize, /* Size of this file */ - &allocImp, - &allocTempImp - )) != SZ_OK) - return -1; + /* Only decompress the folder if it is not allready cached */ + if (entry->archive->folder[entry->folderIndex].cache == NULL) + if (lzma_err(SzExtract( + &entry->archive->stream.InStream, /* compressed data */ + &entry->archive->db, + entry->fileIndex, + /* Index of cached folder, will be changed by SzExtract */ + &entry->archive->folder[entry->folderIndex].index, + /* Cache for decompressed folder, allocated/freed by SzExtract */ + &entry->archive->folder[entry->folderIndex].cache, + /* Size of cache, will be changed by 
SzExtract */ + &entry->archive->folder[entry->folderIndex].size, + /* Offset of this file inside the cache, set by SzExtract */ + &entry->offset, + &fileSize, /* Size of this file */ + &allocImp, + &allocTempImp + )) != SZ_OK) + return -1; /* Copy wanted bytes over from cache to outBuffer */ strncpy(outBuffer, - (void*)(entry->archive->block + entry->offset + entry->position), + (void*) (entry->archive->folder[entry->folderIndex].cache + + entry->offset + entry->position), wantedSize); entry->position += wantedSize; return objCount; @@ -352,6 +373,13 @@ if (entry->next != NULL) entry->next->previous = entry->previous; + entry->archive->folder[entry->folderIndex].references--; + if (entry->archive->folder[entry->folderIndex].references == 0) + { + allocator.Free(entry->archive->folder[entry->folderIndex].cache); + entry->archive->folder[entry->folderIndex].cache = NULL; + } + allocator.Free(entry); entry = NULL; @@ -384,17 +412,17 @@ static void *LZMA_openArchive(const char *name, int forWriting) { + PHYSFS_uint64 len; LZMAarchive *archive = NULL; ISzAlloc allocImp; ISzAlloc allocTempImp; BAIL_IF_MACRO(forWriting, ERR_ARC_IS_READ_ONLY, NULL); - BAIL_IF_MACRO(!LZMA_isArchive(name, forWriting), ERR_UNSUPPORTED_ARCHIVE, 0); + BAIL_IF_MACRO(!LZMA_isArchive(name,forWriting), ERR_UNSUPPORTED_ARCHIVE, 0); archive = (LZMAarchive *) allocator.Malloc(sizeof (LZMAarchive)); BAIL_IF_MACRO(archive == NULL, ERR_OUT_OF_MEMORY, NULL); - archive->block = NULL; archive->firstEntry = NULL; archive->lastEntry = NULL; @@ -424,6 +452,16 @@ return NULL; } /* if */ + len = archive->db.Database.NumFolders * sizeof (LZMAfolder); + archive->folder = (LZMAfolder *) allocator.Malloc(len); + BAIL_IF_MACRO(archive->folder == NULL, ERR_OUT_OF_MEMORY, NULL); + + /* + * Init with 0 so we know when a folder is already cached + * Values will be set by LZMA_read() + */ + memset(archive->folder, 0, len); + return(archive); } /* LZMA_openArchive */ @@ -529,20 +567,27 @@ { LZMAarchive *archive = 
(LZMAarchive *) opaque; LZMAentry *entry = NULL; - PHYSFS_uint32 index = 0; + PHYSFS_uint32 fileIndex = 0; + PHYSFS_uint32 folderIndex = 0; - *fileExists = lzma_find_entry(archive, name, &index); + *fileExists = lzma_find_entry(archive, name, &fileIndex); BAIL_IF_MACRO(!*fileExists, ERR_NO_SUCH_FILE, NULL); + folderIndex = archive->db.FileIndexToFolderIndexMap[fileIndex]; + BAIL_IF_MACRO(folderIndex == (PHYSFS_uint32)-1, ERR_UNKNOWN_ERROR, NULL); + entry = (LZMAentry *) allocator.Malloc(sizeof (LZMAentry)); BAIL_IF_MACRO(entry == NULL, ERR_OUT_OF_MEMORY, NULL); - entry->index = index; + entry->fileIndex = fileIndex; + entry->folderIndex = folderIndex; entry->archive = archive; - entry->file = archive->db.Database.Files + entry->index; + entry->file = archive->db.Database.Files + entry->fileIndex; entry->offset = 0; /* Offset will be set by LZMA_read() */ entry->position = 0; + archive->folder[folderIndex].references++; + entry->next = NULL; entry->previous = entry->archive->lastEntry; if (entry->previous != NULL) @@ -584,7 +629,7 @@ __PHYSFS_platformClose(archive->stream.File); /* Free the cache which might have been allocated by LZMA_read() */ - allocator.Free(archive->block); + allocator.Free(archive->folder); allocator.Free(archive); } /* LZMA_dirClose */ Modified: trunk/lzma/LZMA-LICENSE.txt =================================================================== --- trunk/lzma/LZMA-LICENSE.txt 2006-09-27 09:21:56 UTC (rev 794) +++ trunk/lzma/LZMA-LICENSE.txt 2006-11-05 10:06:02 UTC (rev 795) @@ -92,4 +92,3 @@ You should have received a copy of the Common Public License along with this library. -k From DONOTREPLY at icculus.org Sun Nov 5 06:09:42 2006 From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org) Date: 5 Nov 2006 06:09:42 -0500 Subject: r796 - trunk/platform Message-ID: <20061105110942.2263.qmail@icculus.org> Author: icculus Date: 2006-11-05 06:09:42 -0500 (Sun, 05 Nov 2006) New Revision: 796 Modified: trunk/platform/win32.c Log: Silly comment typo. 
Modified: trunk/platform/win32.c =================================================================== --- trunk/platform/win32.c 2006-11-05 10:06:02 UTC (rev 795) +++ trunk/platform/win32.c 2006-11-05 11:09:42 UTC (rev 796) @@ -60,7 +60,7 @@ /* * Users without the platform SDK don't have this defined. The original docs * for SetFilePointer() just said to compare with 0xFFFFFFFF, so this should - * work as desired + * work as desired. */ #define PHYSFS_INVALID_SET_FILE_POINTER 0xFFFFFFFF From DONOTREPLY at icculus.org Sun Nov 5 06:10:14 2006 From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org) Date: 5 Nov 2006 06:10:14 -0500 Subject: r797 - trunk Message-ID: <20061105111014.2452.qmail@icculus.org> Author: icculus Date: 2006-11-05 06:10:14 -0500 (Sun, 05 Nov 2006) New Revision: 797 Modified: trunk/CHANGELOG trunk/Makefile.am.newautomake trunk/Makefile.am.oldautomake trunk/makeos2.cmd trunk/physfs.dsp trunk/physfs.h trunk/physfs.vcproj trunk/physfsMPW.make trunk/physfs_static.dsp Log: Initial Unicode work. Modified: trunk/CHANGELOG =================================================================== --- trunk/CHANGELOG 2006-11-05 11:09:42 UTC (rev 796) +++ trunk/CHANGELOG 2006-11-05 11:10:14 UTC (rev 797) @@ -2,7 +2,7 @@ * CHANGELOG. */ -11052006 - More 7zip archiver work (thanks, Dennis!). +11052006 - More 7zip archiver work (thanks, Dennis!). Initial Unicode work. 09272006 - Reworked 7zip archiver (thanks, Dennis!). 09232006 - Fixed typo in doxygen comment. 04112006 - Added LZMA archiver...7zip support (thanks, Dennis!). 
Modified: trunk/Makefile.am.newautomake =================================================================== --- trunk/Makefile.am.newautomake 2006-11-05 11:09:42 UTC (rev 796) +++ trunk/Makefile.am.newautomake 2006-11-05 11:10:14 UTC (rev 797) @@ -110,6 +110,7 @@ libphysfs_la_SOURCES = \ physfs.c \ physfs_internal.h \ + physfs_unicode.c \ physfs_byteorder.c if BUILD_ZLIB Modified: trunk/Makefile.am.oldautomake =================================================================== --- trunk/Makefile.am.oldautomake 2006-11-05 11:09:42 UTC (rev 796) +++ trunk/Makefile.am.oldautomake 2006-11-05 11:10:14 UTC (rev 797) @@ -9,6 +9,7 @@ libphysfs_la_SOURCES = \ physfs.c \ physfs_internal.h \ + physfs_unicode.c \ physfs_byteorder.c if BUILD_ZLIB Modified: trunk/makeos2.cmd =================================================================== --- trunk/makeos2.cmd 2006-11-05 11:09:42 UTC (rev 796) +++ trunk/makeos2.cmd 2006-11-05 11:10:14 UTC (rev 797) @@ -110,6 +110,11 @@ @echo "PHYSFS_getCdRomDirsCallback" >> bin\physfs.def @echo "PHYSFS_getSearchPathCallback" >> bin\physfs.def @echo "PHYSFS_enumerateFilesCallback" >> bin\physfs.def + at echo "PHYSFS_utf8toucs2" >> bin\physfs.def + at echo "PHYSFS_utf8fromucs2" >> bin\physfs.def + at echo "PHYSFS_utf8toucs4" >> bin\physfs.def + at echo "PHYSFS_utf8fromucs4" >> bin\physfs.def + at echo "PHYSFS_utf8fromlatin1" >> bin\physfs.def @echo Building export library... 
emximp -o bin/physfs.lib bin/physfs.def @@ -118,6 +123,7 @@ @echo on gcc %CFLAGS% -o bin/physfs.obj physfs.c gcc %CFLAGS% -o bin/physfs_byteorder.obj physfs_byteorder.c +gcc %CFLAGS% -o bin/physfs_unicode.obj physfs_unicode.c gcc %CFLAGS% -o bin/os2.obj platform/os2.c gcc %CFLAGS% -o bin/dir.obj archivers/dir.c gcc %CFLAGS% -o bin/grp.obj archivers/grp.c Modified: trunk/physfs.dsp =================================================================== --- trunk/physfs.dsp 2006-11-05 11:09:42 UTC (rev 796) +++ trunk/physfs.dsp 2006-11-05 11:10:14 UTC (rev 797) @@ -149,6 +149,10 @@ # End Source File # Begin Source File +SOURCE=.\physfs_unicode.c +# End Source File +# Begin Source File + SOURCE=.\archivers\qpak.c # End Source File # Begin Source File Modified: trunk/physfs.h =================================================================== --- trunk/physfs.h 2006-11-05 11:09:42 UTC (rev 796) +++ trunk/physfs.h 2006-11-05 11:10:14 UTC (rev 797) @@ -147,6 +147,40 @@ * - .WAD (DOOM engine archives) * - .MIX (Older Westwood games archives) * + * + * String policy for PhysicsFS 2.0 and later: + * + * PhysicsFS 1.0 deals with null-terminated ASCII strings. All high ASCII + * chars resulted in undefined behaviour, and there was no Unicode support. + * + * All strings passed through PhysicsFS are in null-terminated UTF-8 format. + * This means that if all you care about is English (ASCII characters <= 127) + * then you just use regular C strings. If you care about Unicode (and you + * should!) then you need to figure out what your platform wants, needs, and + * offers. If you are on Windows and build with Unicode support, your TCHAR + * strings are two bytes per character (this is called "UCS-2 encoding"). You + * should convert them to UTF-8 before handing them to PhysicsFS with + * PHYSFS_utf8fromucs2(). If you're using Unix or Mac OS X, your wchar_t + * strings are four bytes per character ("UCS-4 encoding"). Use + * PHYSFS_utf8fromucs4().
Mac OS X can give you UTF-8 directly from a CFString, + * and many Unixes generally give you C strings in UTF-8 format everywhere. + * If you have a single-byte high ASCII charset, like so-many European + * "codepages" you may be out of luck. We'll convert from "Latin1" to UTF-8 + * only, and never back to Latin1. If you're above ASCII 127, all bets are + * off: move to Unicode or use your platform's facilities. Passing a C string + * with high-ASCII data that isn't UTF-8 encoded will NOT do what you expect! + * + * Naturally, there's also PHYSFS_utf8toucs2() and PHYSFS_utf8toucs4() to get + * data back into a format you like. Behind the scenes, PhysicsFS will use + * Unicode where possible: the UTF-8 strings on Windows will be converted + * and used with the multibyte Windows APIs, for example. + * + * PhysicsFS offers basic encoding conversion support, but not a whole string + * library. Get your stuff into whatever format you can work with. + * + * + * Other stuff: + * * Please see the file LICENSE in the source's root directory for licensing * and redistribution rights. * @@ -1989,7 +2023,129 @@ PHYSFS_EnumFilesCallback c, void *d); +/** + * \fn void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len) + * \brief Convert a UCS-4 string to a UTF-8 string. + * + * UCS-4 strings are 32-bits per character: \c wchar_t on Unix. + * + * To ensure that the destination buffer is large enough for the conversion, + * please allocate a buffer that is the same size as the source buffer. UTF-8 + * never uses more than 32-bits per character, so while it may shrink a UCS-4 + * string, it will never expand it. + * + * Strings that don't fit in the destination buffer will be truncated, but + * will always be null-terminated and never have an incomplete UTF-8 + * sequence at the end. + * + * \param src Null-terminated source string in UCS-4 format. + * \param dst Buffer to store converted UTF-8 string. + * \param len Size, in bytes, of destination buffer.
+ */ +__EXPORT__ void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, + PHYSFS_uint64 len); +/** + * \fn void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len) + * \brief Convert a UTF-8 string to a UCS-4 string. + * + * UCS-4 strings are 32-bits per character: \c wchar_t on Unix. + * + * To ensure that the destination buffer is large enough for the conversion, + * please allocate a buffer that is four times the size of the source buffer. + * UTF-8 uses from one to four bytes per character, but UCS-4 always uses + * four, so an entirely low-ASCII string will quadruple in size! + * + * Strings that don't fit in the destination buffer will be truncated, but + * will always be null-terminated and never have an incomplete UCS-4 + * sequence at the end. + * + * \param src Null-terminated source string in UTF-8 format. + * \param dst Buffer to store converted UCS-4 string. + * \param len Size, in bytes, of destination buffer. + */ +__EXPORT__ void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, + PHYSFS_uint64 len); + +/** + * \fn void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len) + * \brief Convert a UCS-2 string to a UTF-8 string. + * + * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building + * with Unicode support. + * + * To ensure that the destination buffer is large enough for the conversion, + * please allocate a buffer that is double the size of the source buffer. + * UTF-8 never uses more than 32-bits per character, so while it may shrink + * a UCS-2 string, it may also expand it. + * + * Strings that don't fit in the destination buffer will be truncated, but + * will always be null-terminated and never have an incomplete UTF-8 + * sequence at the end. + * + * Please note that UCS-2 is not UTF-16; we do not support the "surrogate" + * values at this time. + * + * \param src Null-terminated source string in UCS-2 format. 
+ * \param dst Buffer to store converted UTF-8 string. + * \param len Size, in bytes, of destination buffer. + */ +__EXPORT__ void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, + PHYSFS_uint64 len); + +/** + * \fn void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len) + * \brief Convert a UTF-8 string to a UCS-2 string. + * + * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building + * with Unicode support. + * + * To ensure that the destination buffer is large enough for the conversion, + * please allocate a buffer that is double the size of the source buffer. + * UTF-8 uses from one to four bytes per character, but UCS-2 always uses + * two, so an entirely low-ASCII string will double in size! + * + * Strings that don't fit in the destination buffer will be truncated, but + * will always be null-terminated and never have an incomplete UCS-2 + * sequence at the end. + * + * Please note that UCS-2 is not UTF-16; we do not support the "surrogate" + * values at this time. + * + * \param src Null-terminated source string in UTF-8 format. + * \param dst Buffer to store converted UCS-2 string. + * \param len Size, in bytes, of destination buffer. + */ +__EXPORT__ void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, + PHYSFS_uint64 len); + +/** + * \fn void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len) + * \brief Convert a Latin1 string to a UTF-8 string. + * + * Latin1 strings are 8-bits per character: a popular "high ASCII" + * encoding. + * + * To ensure that the destination buffer is large enough for the conversion, + * please allocate a buffer that is double the size of the source buffer. + * UTF-8 expands latin1 codepoints over 127 from 1 to 2 bytes, so the string + * may grow in some cases. + * + * Strings that don't fit in the destination buffer will be truncated, but + * will always be null-terminated and never have an incomplete UTF-8 + * sequence at the end.
+ * + * Please note that we do not supply a UTF-8 to Latin1 converter, since Latin1 + * can't express most Unicode codepoints. It's a legacy encoding; you should + * be converting away from it at all times. + * + * \param src Null-terminated source string in Latin1 format. + * \param dst Buffer to store converted UTF-8 string. + * \param len Size, in bytes, of destination buffer. + */ +__EXPORT__ void PHYSFS_utf8fromlatin1(const char *src, char *dst, + PHYSFS_uint64 len); + /* Everything above this line is part of the PhysicsFS 2.0 API. */ Modified: trunk/physfs.vcproj =================================================================== --- trunk/physfs.vcproj 2006-11-05 11:09:42 UTC (rev 796) +++ trunk/physfs.vcproj 2006-11-05 11:10:14 UTC (rev 797) @@ -194,6 +194,9 @@ RelativePath=".\physfs_byteorder.c"> + + Author: icculus Date: 2006-11-05 06:30:48 -0500 (Sun, 05 Nov 2006) New Revision: 798 Modified: trunk/lzma/ Log: Ignore automake files. Property changes on: trunk/lzma ___________________________________________________________________ Name: svn:ignore + Makefile.in Makefile From DONOTREPLY at icculus.org Sun Nov 5 06:32:18 2006 From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org) Date: 5 Nov 2006 06:32:18 -0500 Subject: r799 - trunk Message-ID: <20061105113218.8628.qmail@icculus.org> Author: icculus Date: 2006-11-05 06:32:18 -0500 (Sun, 05 Nov 2006) New Revision: 799 Added: trunk/physfs_unicode.c Log: Initial add. Added: trunk/physfs_unicode.c =================================================================== --- trunk/physfs_unicode.c (rev 0) +++ trunk/physfs_unicode.c 2006-11-05 11:32:18 UTC (rev 799) @@ -0,0 +1,338 @@ +#if HAVE_CONFIG_H +# include +#endif + +#include "physfs.h" + +#define __PHYSICSFS_INTERNAL__ +#include "physfs_internal.h" + + +/* + * From rfc3629, the UTF-8 spec: + * http://www.ietf.org/rfc/rfc3629.txt + * + * Char. 
number range | UTF-8 octet sequence + * (hexadecimal) | (binary) + * --------------------+--------------------------------------------- + * 0000 0000-0000 007F | 0xxxxxxx + * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + + +/* + * This may not be the best value, but it's one that isn't represented + * in Unicode (0x10FFFF is the largest codepoint value). We return this + * value from utf8codepoint() if there's bogus bits in the + * stream. utf8codepoint() will turn this value into something + * reasonable (like a question mark), for text that wants to try to recover, + * whereas utf8valid() will use the value to determine if a string has bad + * bits. + */ +#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF + +/* + * This is the codepoint we currently return when there was bogus bits in a + * UTF-8 string. May not fly in Asian locales? + */ +#define UNICODE_BOGUS_CHAR_CODEPOINT '?' + +static PHYSFS_uint32 utf8codepoint(const char **_str) +{ + const char *str = *_str; + PHYSFS_uint32 retval = 0; + PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str); + PHYSFS_uint32 octet2, octet3, octet4; + + if (octet == 0) /* null terminator, end of string. */ + return 0; + + else if (octet < 128) /* one octet char: 0 to 127 */ + { + (*_str)++; /* skip to next possible start of codepoint. */ + return(octet); + } /* else if */ + + else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */ + { + /* + * Apparently each of these is supposed to be flagged as a bogus + * char, instead of just resyncing to the next valid codepoint. + */ + (*_str)++; /* skip to next possible start of codepoint. */ + return UNICODE_BOGUS_CHAR_VALUE; + } /* else if */ + + else if (octet < 224) /* two octets */ + { + octet -= (128+64); + octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? 
*/ + return UNICODE_BOGUS_CHAR_VALUE; + + *_str += 2; /* skip to next possible start of codepoint. */ + retval = ((octet << 6) | (octet2 - 128)); + if ((retval >= 0x80) && (retval <= 0x7FF)) + return retval; + } /* else if */ + + else if (octet < 240) /* three octets */ + { + octet -= (128+64+32); + octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + *_str += 3; /* skip to next possible start of codepoint. */ + retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) ); + + /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */ + switch (retval) + { + case 0xD800: + case 0xDB7F: + case 0xDB80: + case 0xDBFF: + case 0xDC00: + case 0xDF80: + case 0xDFFF: + return UNICODE_BOGUS_CHAR_VALUE; + } /* switch */ + + /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */ + if ((retval >= 0x800) && (retval <= 0xFFFD)) + return retval; + } /* else if */ + + else if (octet < 248) /* four octets */ + { + octet -= (128+64+32+16); + octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + *_str += 4; /* skip to next possible start of codepoint. 
*/ + retval = ( ((octet << 18)) | ((octet2 - 128) << 12) | + ((octet3 - 128) << 6) | ((octet4 - 128)) ); + if ((retval >= 0x10000) && (retval <= 0x10FFFF)) + return retval; + } /* else if */ + + /* + * Five and six octet sequences became illegal in rfc3629. + * We throw the codepoint away, but parse them to make sure we move + * ahead the right number of bytes and don't overflow the buffer. + */ + + else if (octet < 252) /* five octets */ + { + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + *_str += 5; /* skip to next possible start of codepoint. */ + return UNICODE_BOGUS_CHAR_VALUE; + } /* else if */ + + else /* six octets */ + { + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? 
*/ + return UNICODE_BOGUS_CHAR_VALUE; + + *_str += 6; /* skip to next possible start of codepoint. */ + return UNICODE_BOGUS_CHAR_VALUE; + } /* else if */ + + return UNICODE_BOGUS_CHAR_VALUE; +} /* utf8codepoint */ + +void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len) +{ + len -= sizeof (PHYSFS_uint32); /* save room for null char. */ + while (len >= sizeof (PHYSFS_uint32)) + { + PHYSFS_uint32 cp = utf8codepoint(&src); + if (cp == 0) + break; + else if (cp == UNICODE_BOGUS_CHAR_VALUE) + cp = UNICODE_BOGUS_CHAR_CODEPOINT; + *(dst++) = cp; + len -= sizeof (PHYSFS_uint32); + } /* while */ + + *dst = 0; +} /* PHYSFS_utf8toucs4 */ + +void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len) +{ + len -= sizeof (PHYSFS_uint16); /* save room for null char. */ + while (len >= sizeof (PHYSFS_uint16)) + { + PHYSFS_uint32 cp = utf8codepoint(&src); + if (cp == 0) + break; + else if (cp == UNICODE_BOGUS_CHAR_VALUE) + cp = UNICODE_BOGUS_CHAR_CODEPOINT; + + /* !!! BLUESKY: UTF-16 surrogates? */ + if (cp > 0xFFFF) + cp = UNICODE_BOGUS_CHAR_CODEPOINT; + + *(dst++) = cp; + len -= sizeof (PHYSFS_uint16); + } /* while */ + + *dst = 0; +} /* PHYSFS_utf8toucs2 */ + +static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len) +{ + char *dst = *_dst; + PHYSFS_uint64 len = *_len; + + if (len == 0) + return; + + if (cp > 0x10FFFF) + cp = UNICODE_BOGUS_CHAR_CODEPOINT; + else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */ + cp = UNICODE_BOGUS_CHAR_CODEPOINT; + else + { + /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */ + switch (cp) + { + case 0xD800: + case 0xDB7F: + case 0xDB80: + case 0xDBFF: + case 0xDC00: + case 0xDF80: + case 0xDFFF: + cp = UNICODE_BOGUS_CHAR_CODEPOINT; + } /* switch */ + } /* else */ + + /* Do the encoding... 
*/ + if (cp < 0x80) + { + *(dst++) = (char) cp; + len--; + } /* if */ + + else if (cp < 0x800) + { + if (len < 2) + len = 0; + else + { + *(dst++) = (char) ((cp >> 6) | 128 | 64); + *(dst++) = (char) (cp & 0x3F) | 128; + len -= 2; + } /* else */ + } /* else if */ + + else if (cp < 0x10000) + { + if (len < 3) + len = 0; + else + { + *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32); + *(dst++) = (char) ((cp >> 6) & 0x3F) | 128; + *(dst++) = (char) (cp & 0x3F) | 128; + len -= 3; + } /* else */ + } /* else if */ + + else + { + if (len < 4) + len = 0; + else + { + *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16); + *(dst++) = (char) ((cp >> 12) & 0x3F) | 128; + *(dst++) = (char) ((cp >> 6) & 0x3F) | 128; + *(dst++) = (char) (cp & 0x3F) | 128; + len -= 4; + } /* else if */ + } /* else */ + + *_dst = dst; + *_len = len; +} /* utf8fromcodepoint */ + +#define UTF8FROMTYPE(typ, src, dst, len) \ + len--; \ + while (len) \ + { \ + const PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++); \ + if (cp == 0) break; \ + utf8fromcodepoint(cp, &dst, &len); \ + } \ + *dst = '\0'; \ + +void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len) +{ + UTF8FROMTYPE(PHYSFS_uint32, src, dst, len); +} /* PHYSFS_utf8fromucs4 */ + +void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len) +{ + UTF8FROMTYPE(PHYSFS_uint64, src, dst, len); +} /* PHYSFS_utf8fromucs4 */ + +/* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */ +void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len) +{ + UTF8FROMTYPE(PHYSFS_uint8, src, dst, len); +} /* PHYSFS_utf8fromlatin1 */ + +#undef UTF8FROMTYPE + +/* end of physfs_unicode.c ... */ + From DONOTREPLY at icculus.org Sun Nov 5 14:06:23 2006 From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org) Date: 5 Nov 2006 14:06:23 -0500 Subject: r800 - in trunk: . 
platform Message-ID: <20061105190623.6982.qmail@icculus.org> Author: icculus Date: 2006-11-05 14:06:23 -0500 (Sun, 05 Nov 2006) New Revision: 800 Modified: trunk/CHANGELOG trunk/platform/beos.cpp Log: Apparently BeOS's BPath constructor doesn't actually _need_ leaf to be manually split out. Modified: trunk/CHANGELOG =================================================================== --- trunk/CHANGELOG 2006-11-05 11:32:18 UTC (rev 799) +++ trunk/CHANGELOG 2006-11-05 19:06:23 UTC (rev 800) @@ -3,6 +3,7 @@ */ 11052006 - More 7zip archiver work (thanks, Dennis!). Initial Unicode work. + Minor BeOS realpath tweak. 09272006 - Reworked 7zip archiver (thanks, Dennis!). 09232006 - Fixed typo in doxygen comment. 04112006 - Added LZMA archiver...7zip support (thanks, Dennis!). Modified: trunk/platform/beos.cpp =================================================================== --- trunk/platform/beos.cpp 2006-11-05 11:32:18 UTC (rev 799) +++ trunk/platform/beos.cpp 2006-11-05 19:06:23 UTC (rev 800) @@ -200,14 +200,7 @@ char *__PHYSFS_platformRealPath(const char *path) { - char *str = (char *) alloca(strlen(path) + 1); - BAIL_IF_MACRO(str == NULL, ERR_OUT_OF_MEMORY, NULL); - strcpy(str, path); - char *leaf = strrchr(str, '/'); - if (leaf != NULL) - *(leaf++) = '\0'; - - BPath normalized(str, leaf, true); /* force normalization of path. */ + BPath normalized(path, NULL, true); /* force normalization of path. */ const char *resolved_path = normalized.Path(); BAIL_IF_MACRO(resolved_path == NULL, ERR_NO_SUCH_FILE, NULL); char *retval = (char *) allocator.Malloc(strlen(resolved_path) + 1);