From DONOTREPLY at icculus.org  Sun Nov  5 05:06:02 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 05:06:02 -0500
Subject: r795 - in trunk: . archivers lzma
Message-ID: <20061105100602.17660.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 05:06:02 -0500 (Sun, 05 Nov 2006)
New Revision: 795
Modified:
   trunk/CHANGELOG
   trunk/archivers/lzma.c
   trunk/lzma/LZMA-LICENSE.txt
Log:
More 7zip work (thanks, Dennis!)
Modified: trunk/CHANGELOG
===================================================================
--- trunk/CHANGELOG	2006-09-27 09:21:56 UTC (rev 794)
+++ trunk/CHANGELOG	2006-11-05 10:06:02 UTC (rev 795)
@@ -2,6 +2,7 @@
  * CHANGELOG.
  */
 
+11052006 - More 7zip archiver work (thanks, Dennis!).
 09272006 - Reworked 7zip archiver (thanks, Dennis!).
 09232006 - Fixed typo in doxygen comment.
 04112006 - Added LZMA archiver...7zip support (thanks, Dennis!).
Modified: trunk/archivers/lzma.c
===================================================================
--- trunk/archivers/lzma.c	2006-09-27 09:21:56 UTC (rev 794)
+++ trunk/archivers/lzma.c	2006-11-05 10:06:02 UTC (rev 795)
@@ -51,16 +51,29 @@
     void *File;
 } CFileInStream;
 
-/* Set by LZMA_openArchive, except blockXXX which is handled by LZMA_read() */
+/*
+ * In LZMA the archive is split into blocks; those are called folders.
+ * Set by LZMA_read()
+*/
+typedef struct _LZMAfolder
+{
+    PHYSFS_uint8 *cache; /* Cached folder */
+    PHYSFS_uint32 size; /* Size of folder */
+    PHYSFS_uint32 index; /* Index of folder in archive */
+    PHYSFS_uint32 references; /* Number of files using this block */
+} LZMAfolder;
+
+/*
+ * Set by LZMA_openArchive(), except folder which gets its values
+ *  in LZMA_read()
+ */
 typedef struct _LZMAarchive
 {
     struct _LZMAentry *firstEntry; /* Used for cleanup on shutdown */
     struct _LZMAentry *lastEntry;
+    LZMAfolder *folder; /* Array of folders */
     CArchiveDatabaseEx db; /* For 7z: Database */
     CFileInStream stream; /* For 7z: Input file incl. read and seek callbacks */
-    unsigned char *block; /* Currently cached block */
-    size_t blockSize; /* Size of current block */
-    PHYSFS_uint32 blockIndex; /* Index of current block */
 } LZMAarchive;
 
 /* Set by LZMA_openRead(), except offset which is set by LZMA_read() */
@@ -70,8 +83,9 @@
     struct _LZMAentry *previous;
     LZMAarchive *archive; /* Link to corresponding archive */
     CFileItem *file; /* For 7z: File info, eg. name, size */
-    PHYSFS_uint32 index; /* Index inside the archive */
-    size_t offset; /* Offset inside archive block */
+    PHYSFS_uint32 fileIndex; /* Index of file in archive */
+    PHYSFS_uint32 folderIndex; /* Index of folder in archive */
+    size_t offset; /* Offset in folder */
     PHYSFS_uint32 position; /* Current "virtual" position in file */
 } LZMAentry;
 
@@ -273,23 +287,30 @@
     allocTempImp.Alloc = SzAllocPhysicsFS;
     allocTempImp.Free = SzFreePhysicsFS;
 
-    if (lzma_err(SzExtract(
-        &entry->archive->stream.InStream, /* compressed data */
-        &entry->archive->db,
-        entry->index,
-        &entry->archive->blockIndex, /* Index of currently cached block, may be changed by SzExtract */
-        &entry->archive->block, /* Cache of current decompressed block, may be allocated/freed by SzExtract */
-        &entry->archive->blockSize, /* Size of current cache, may be changed by SzExtract */
-        &entry->offset, /* Offset of this file inside the cache block, set by SzExtract */
-        &fileSize, /* Size of this file */
-        &allocImp,
-        &allocTempImp
-            )) != SZ_OK)
-        return -1;
+    /* Only decompress the folder if it is not already cached */
+    if (entry->archive->folder[entry->folderIndex].cache == NULL)
+        if (lzma_err(SzExtract(
+            &entry->archive->stream.InStream, /* compressed data */
+            &entry->archive->db,
+            entry->fileIndex,
+            /* Index of cached folder, will be changed by SzExtract */
+            &entry->archive->folder[entry->folderIndex].index,
+            /* Cache for decompressed folder, allocated/freed by SzExtract */
+            &entry->archive->folder[entry->folderIndex].cache,
+            /* Size of cache, will be changed by SzExtract */
+            &entry->archive->folder[entry->folderIndex].size,
+            /* Offset of this file inside the cache, set by SzExtract */
+            &entry->offset,
+            &fileSize, /* Size of this file */
+            &allocImp,
+            &allocTempImp
+                )) != SZ_OK)
+            return -1;
 
     /* Copy wanted bytes over from cache to outBuffer */
     strncpy(outBuffer,
-            (void*)(entry->archive->block + entry->offset + entry->position),
+            (void*) (entry->archive->folder[entry->folderIndex].cache +
+                     entry->offset + entry->position),
             wantedSize);
     entry->position += wantedSize;
     return objCount;
@@ -352,6 +373,13 @@
     if (entry->next != NULL)
         entry->next->previous = entry->previous;
 
+    entry->archive->folder[entry->folderIndex].references--;
+    if (entry->archive->folder[entry->folderIndex].references == 0)
+    {
+        allocator.Free(entry->archive->folder[entry->folderIndex].cache);
+        entry->archive->folder[entry->folderIndex].cache = NULL;
+    }
+
     allocator.Free(entry);
     entry = NULL;
 
@@ -384,17 +412,17 @@
 
 static void *LZMA_openArchive(const char *name, int forWriting)
 {
+    PHYSFS_uint64 len;
     LZMAarchive *archive = NULL;
     ISzAlloc allocImp;
     ISzAlloc allocTempImp;
 
     BAIL_IF_MACRO(forWriting, ERR_ARC_IS_READ_ONLY, NULL);
-    BAIL_IF_MACRO(!LZMA_isArchive(name, forWriting), ERR_UNSUPPORTED_ARCHIVE, 0);
+    BAIL_IF_MACRO(!LZMA_isArchive(name,forWriting), ERR_UNSUPPORTED_ARCHIVE, 0);
 
     archive = (LZMAarchive *) allocator.Malloc(sizeof (LZMAarchive));
     BAIL_IF_MACRO(archive == NULL, ERR_OUT_OF_MEMORY, NULL);
 
-    archive->block = NULL;
     archive->firstEntry = NULL;
     archive->lastEntry = NULL;
 
@@ -424,6 +452,16 @@
         return NULL;
     } /* if */
 
+    len = archive->db.Database.NumFolders * sizeof (LZMAfolder);
+    archive->folder = (LZMAfolder *) allocator.Malloc(len);
+    BAIL_IF_MACRO(archive->folder == NULL, ERR_OUT_OF_MEMORY, NULL);
+
+    /*
+     * Init with 0 so we know when a folder is already cached
+     * Values will be set by LZMA_read()
+     */
+    memset(archive->folder, 0, len);
+
     return(archive);
 } /* LZMA_openArchive */
 
@@ -529,20 +567,27 @@
 {
     LZMAarchive *archive = (LZMAarchive *) opaque;
     LZMAentry *entry = NULL;
-    PHYSFS_uint32 index = 0;
+    PHYSFS_uint32 fileIndex = 0;
+    PHYSFS_uint32 folderIndex = 0;
 
-    *fileExists = lzma_find_entry(archive, name, &index);
+    *fileExists = lzma_find_entry(archive, name, &fileIndex);
     BAIL_IF_MACRO(!*fileExists, ERR_NO_SUCH_FILE, NULL);
 
+    folderIndex = archive->db.FileIndexToFolderIndexMap[fileIndex];
+    BAIL_IF_MACRO(folderIndex == (PHYSFS_uint32)-1, ERR_UNKNOWN_ERROR, NULL);
+
     entry = (LZMAentry *) allocator.Malloc(sizeof (LZMAentry));
     BAIL_IF_MACRO(entry == NULL, ERR_OUT_OF_MEMORY, NULL);
 
-    entry->index = index;
+    entry->fileIndex = fileIndex;
+    entry->folderIndex = folderIndex;
     entry->archive = archive;
-    entry->file = archive->db.Database.Files + entry->index;
+    entry->file = archive->db.Database.Files + entry->fileIndex;
     entry->offset = 0; /* Offset will be set by LZMA_read() */
     entry->position = 0;
 
+    archive->folder[folderIndex].references++;
+
     entry->next = NULL;
     entry->previous = entry->archive->lastEntry;
     if (entry->previous != NULL)
@@ -584,7 +629,7 @@
     __PHYSFS_platformClose(archive->stream.File);
 
     /* Free the cache which might have been allocated by LZMA_read() */
-    allocator.Free(archive->block);
+    allocator.Free(archive->folder);
     allocator.Free(archive);
 } /* LZMA_dirClose */
 
Modified: trunk/lzma/LZMA-LICENSE.txt
===================================================================
--- trunk/lzma/LZMA-LICENSE.txt	2006-09-27 09:21:56 UTC (rev 794)
+++ trunk/lzma/LZMA-LICENSE.txt	2006-11-05 10:06:02 UTC (rev 795)
@@ -92,4 +92,3 @@
 
 You should have received a copy of the Common Public License
 along with this library.
-k
From DONOTREPLY at icculus.org  Sun Nov  5 06:09:42 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 06:09:42 -0500
Subject: r796 - trunk/platform
Message-ID: <20061105110942.2263.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 06:09:42 -0500 (Sun, 05 Nov 2006)
New Revision: 796
Modified:
   trunk/platform/win32.c
Log:
Silly comment typo.
Modified: trunk/platform/win32.c
===================================================================
--- trunk/platform/win32.c	2006-11-05 10:06:02 UTC (rev 795)
+++ trunk/platform/win32.c	2006-11-05 11:09:42 UTC (rev 796)
@@ -60,7 +60,7 @@
 /*
  * Users without the platform SDK don't have this defined.  The original docs
  *  for SetFilePointer() just said to compare with 0xFFFFFFFF, so this should
- *  work as desired
+ *  work as desired.
  */
 #define PHYSFS_INVALID_SET_FILE_POINTER  0xFFFFFFFF
 
From DONOTREPLY at icculus.org  Sun Nov  5 06:10:14 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 06:10:14 -0500
Subject: r797 - trunk
Message-ID: <20061105111014.2452.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 06:10:14 -0500 (Sun, 05 Nov 2006)
New Revision: 797
Modified:
   trunk/CHANGELOG
   trunk/Makefile.am.newautomake
   trunk/Makefile.am.oldautomake
   trunk/makeos2.cmd
   trunk/physfs.dsp
   trunk/physfs.h
   trunk/physfs.vcproj
   trunk/physfsMPW.make
   trunk/physfs_static.dsp
Log:
Initial Unicode work.
Modified: trunk/CHANGELOG
===================================================================
--- trunk/CHANGELOG	2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/CHANGELOG	2006-11-05 11:10:14 UTC (rev 797)
@@ -2,7 +2,7 @@
  * CHANGELOG.
  */
 
-11052006 - More 7zip archiver work (thanks, Dennis!).
+11052006 - More 7zip archiver work (thanks, Dennis!). Initial Unicode work.
 09272006 - Reworked 7zip archiver (thanks, Dennis!).
 09232006 - Fixed typo in doxygen comment.
 04112006 - Added LZMA archiver...7zip support (thanks, Dennis!).
Modified: trunk/Makefile.am.newautomake
===================================================================
--- trunk/Makefile.am.newautomake	2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/Makefile.am.newautomake	2006-11-05 11:10:14 UTC (rev 797)
@@ -110,6 +110,7 @@
 libphysfs_la_SOURCES =	\
 	physfs.c	\
 	physfs_internal.h	\
+	physfs_unicode.c	\
 	physfs_byteorder.c
 
 if BUILD_ZLIB
Modified: trunk/Makefile.am.oldautomake
===================================================================
--- trunk/Makefile.am.oldautomake	2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/Makefile.am.oldautomake	2006-11-05 11:10:14 UTC (rev 797)
@@ -9,6 +9,7 @@
 libphysfs_la_SOURCES =	\
 	physfs.c	\
 	physfs_internal.h	\
+	physfs_unicode.c	\
 	physfs_byteorder.c
 
 if BUILD_ZLIB
Modified: trunk/makeos2.cmd
===================================================================
--- trunk/makeos2.cmd	2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/makeos2.cmd	2006-11-05 11:10:14 UTC (rev 797)
@@ -110,6 +110,11 @@
 @echo  "PHYSFS_getCdRomDirsCallback" >> bin\physfs.def
 @echo  "PHYSFS_getSearchPathCallback" >> bin\physfs.def
 @echo  "PHYSFS_enumerateFilesCallback" >> bin\physfs.def
+@echo  "PHYSFS_utf8toucs2" >> bin\physfs.def
+@echo  "PHYSFS_utf8fromucs2" >> bin\physfs.def
+@echo  "PHYSFS_utf8toucs4" >> bin\physfs.def
+@echo  "PHYSFS_utf8fromucs4" >> bin\physfs.def
+@echo  "PHYSFS_utf8fromlatin1" >> bin\physfs.def
 
 @echo Building export library...
 emximp -o bin/physfs.lib bin/physfs.def
@@ -118,6 +123,7 @@
 @echo on
 gcc %CFLAGS% -o bin/physfs.obj physfs.c
 gcc %CFLAGS% -o bin/physfs_byteorder.obj physfs_byteorder.c
+gcc %CFLAGS% -o bin/physfs_unicode.obj physfs_unicode.c
 gcc %CFLAGS% -o bin/os2.obj platform/os2.c
 gcc %CFLAGS% -o bin/dir.obj archivers/dir.c
 gcc %CFLAGS% -o bin/grp.obj archivers/grp.c
Modified: trunk/physfs.dsp
===================================================================
--- trunk/physfs.dsp	2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/physfs.dsp	2006-11-05 11:10:14 UTC (rev 797)
@@ -149,6 +149,10 @@
 # End Source File
 # Begin Source File
 
+SOURCE=.\physfs_unicode.c
+# End Source File
+# Begin Source File
+
 SOURCE=.\archivers\qpak.c
 # End Source File
 # Begin Source File
Modified: trunk/physfs.h
===================================================================
--- trunk/physfs.h	2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/physfs.h	2006-11-05 11:10:14 UTC (rev 797)
@@ -147,6 +147,40 @@
  *   - .WAD (DOOM engine archives)
  *   - .MIX (Older Westwood games archives)
  *
+ *
+ * String policy for PhysicsFS 2.0 and later:
+ *
+ * PhysicsFS 1.0 deals with null-terminated ASCII strings. All high ASCII
+ *  chars resulted in undefined behaviour, and there was no Unicode support.
+ *
+ * All strings passed through PhysicsFS are in null-terminated UTF-8 format.
+ *  This means that if all you care about is English (ASCII characters <= 127)
+ *  then you just use regular C strings. If you care about Unicode (and you
+ *  should!) then you need to figure out what your platform wants, needs, and
+ *  offers. If you are on Windows and build with Unicode support, your TCHAR
+ *  strings are two bytes per character (this is called "UCS-2 encoding"). You
+ *  should convert them to UTF-8 before handing them to PhysicsFS with
+ *  PHYSFS_utf8fromucs2(). If you're using Unix or Mac OS X, your wchar_t
+ *  strings are four bytes per character ("UCS-4 encoding"). Use
+ *  PHYSFS_utf8fromucs4(). Mac OS X can give you UTF-8 directly from a CFString,
+ *  and many Unixes generally give you C strings in UTF-8 format everywhere.
+ *  If you have a single-byte high ASCII charset, like so-many European
+ *  "codepages" you may be out of luck. We'll convert from "Latin1" to UTF-8
+ *  only, and never back to Latin1. If you're above ASCII 127, all bets are
+ *  off: move to Unicode or use your platform's facilities. Passing a C string
+ *  with high-ASCII data that isn't UTF-8 encoded will NOT do what you expect!
+ *
+ * Naturally, there's also PHYSFS_utf8toucs2() and PHYSFS_utf8toucs4() to get
+ *  data back into a format you like. Behind the scenes, PhysicsFS will use
+ *  Unicode where possible: the UTF-8 strings on Windows will be converted
+ *  and used with the multibyte Windows APIs, for example.
+ *
+ * PhysicsFS offers basic encoding conversion support, but not a whole string
+ *  library. Get your stuff into whatever format you can work with.
+ *
+ *
+ * Other stuff:
+ *
  * Please see the file LICENSE in the source's root directory for licensing
  *  and redistribution rights.
  *
@@ -1989,7 +2023,129 @@
                                               PHYSFS_EnumFilesCallback c,
                                               void *d);
 
+/**
+ * \fn void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a UCS-4 string to a UTF-8 string.
+ *
+ * UCS-4 strings are 32-bits per character: \c wchar_t on Unix.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is the same size as the source buffer. UTF-8
+ *  never uses more than 32-bits per character, so while it may shrink a UCS-4
+ *  string, it will never expand it.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UTF-8
+ *  sequence at the end.
+ *
+ *   \param src Null-terminated source string in UCS-4 format.
+ *   \param dst Buffer to store converted UTF-8 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst,
+                                    PHYSFS_uint64 len);
 
+/**
+ * \fn void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
+ * \brief Convert a UTF-8 string to a UCS-4 string.
+ *
+ * UCS-4 strings are 32-bits per character: \c wchar_t on Unix.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is four times the size of the source buffer.
+ *  UTF-8 uses from one to four bytes per character, but UCS-4 always uses
+ *  four, so an entirely low-ASCII string will quadruple in size!
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UCS-4
+ *  sequence at the end.
+ *
+ *   \param src Null-terminated source string in UTF-8 format.
+ *   \param dst Buffer to store converted UCS-4 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst,
+                                  PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a UCS-2 string to a UTF-8 string.
+ *
+ * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
+ *  with Unicode support.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is double the size of the source buffer.
+ *  UTF-8 never uses more than 32-bits per character, so while it may shrink
+ *  a UCS-2 string, it may also expand it.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UTF-8
+ *  sequence at the end.
+ *
+ * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
+ *  values at this time.
+ *
+ *   \param src Null-terminated source string in UCS-2 format.
+ *   \param dst Buffer to store converted UTF-8 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst,
+                                    PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
+ * \brief Convert a UTF-8 string to a UCS-2 string.
+ *
+ * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
+ *  with Unicode support.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is double the size of the source buffer.
+ *  UTF-8 uses from one to four bytes per character, but UCS-2 always uses
+ *  two, so an entirely low-ASCII string will double in size!
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UCS-2
+ *  sequence at the end.
+ *
+ * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
+ *  values at this time.
+ *
+ *   \param src Null-terminated source string in UTF-8 format.
+ *   \param dst Buffer to store converted UCS-2 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst,
+                                  PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a Latin1 string to a UTF-8 string.
+ *
+ * Latin1 strings are 8-bits per character: a popular "high ASCII"
+ *  encoding.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is double the size of the source buffer.
+ *  UTF-8 expands latin1 codepoints over 127 from 1 to 2 bytes, so the string
+ *  may grow in some cases.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UTF-8
+ *  sequence at the end.
+ *
+ * Please note that we do not supply a UTF-8 to Latin1 converter, since Latin1
+ *  can't express most Unicode codepoints. It's a legacy encoding; you should
+ *  be converting away from it at all times.
+ *
+ *   \param src Null-terminated source string in Latin1 format.
+ *   \param dst Buffer to store converted UTF-8 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromlatin1(const char *src, char *dst,
+                                  PHYSFS_uint64 len);
+
 /* Everything above this line is part of the PhysicsFS 2.0 API. */
 
 
Modified: trunk/physfs.vcproj
===================================================================
--- trunk/physfs.vcproj	2006-11-05 11:09:42 UTC (rev 796)
+++ trunk/physfs.vcproj	2006-11-05 11:10:14 UTC (rev 797)
@@ -194,6 +194,9 @@
 				RelativePath=".\physfs_byteorder.c">
 			
 			
+			
+			
 			
 			
Author: icculus
Date: 2006-11-05 06:30:48 -0500 (Sun, 05 Nov 2006)
New Revision: 798
Modified:
   trunk/lzma/
Log:
Ignore automake files.
Property changes on: trunk/lzma
___________________________________________________________________
Name: svn:ignore
   + Makefile.in
Makefile
From DONOTREPLY at icculus.org  Sun Nov  5 06:32:18 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 06:32:18 -0500
Subject: r799 - trunk
Message-ID: <20061105113218.8628.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 06:32:18 -0500 (Sun, 05 Nov 2006)
New Revision: 799
Added:
   trunk/physfs_unicode.c
Log:
Initial add.
Added: trunk/physfs_unicode.c
===================================================================
--- trunk/physfs_unicode.c	                        (rev 0)
+++ trunk/physfs_unicode.c	2006-11-05 11:32:18 UTC (rev 799)
@@ -0,0 +1,338 @@
+#if HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include "physfs.h"
+
+#define __PHYSICSFS_INTERNAL__
+#include "physfs_internal.h"
+
+
+/*
+ * From rfc3629, the UTF-8 spec:
+ *  http://www.ietf.org/rfc/rfc3629.txt
+ *
+ *   Char. number range  |        UTF-8 octet sequence
+ *      (hexadecimal)    |              (binary)
+ *   --------------------+---------------------------------------------
+ *   0000 0000-0000 007F | 0xxxxxxx
+ *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+
+/*
+ * This may not be the best value, but it's one that isn't represented
+ *  in Unicode (0x10FFFF is the largest codepoint value). We return this
+ *  value from utf8codepoint() if there's bogus bits in the
+ *  stream. Callers such as PHYSFS_utf8toucs4() turn this value into something
+ *  reasonable (like a question mark), for text that wants to try to recover,
+ *  whereas utf8valid() will use the value to determine if a string has bad
+ *  bits.
+ */
+#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
+
+/*
+ * This is the codepoint we currently return when there was bogus bits in a
+ *  UTF-8 string. May not fly in Asian locales?
+ */
+#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
+
+/*
+ * Decode one codepoint from the UTF-8 string at (*_str) and advance (*_str)
+ *  past the octets that were consumed.
+ *
+ * Returns 0, without advancing, when (*_str) sits on the null terminator.
+ *  Returns the codepoint for a well-formed sequence. Returns
+ *  UNICODE_BOGUS_CHAR_VALUE for anything else: a stray continuation octet,
+ *  a sequence that is cut short, an overlong encoding, a UTF-16 surrogate
+ *  (0xD800-0xDFFF), 0xFFFE or 0xFFFF, a value above 0x10FFFF, or one of the
+ *  five and six octet forms that rfc3629 made illegal.
+ *
+ * Except at the null terminator, (*_str) always moves forward by at least
+ *  one octet, so a caller that loops on this function makes progress even
+ *  on garbage input. When an expected continuation octet is missing, only
+ *  the lead octet is consumed; the octets after it (which may include the
+ *  null terminator or the start of a valid sequence) are examined again by
+ *  the following calls.
+ */
+static PHYSFS_uint32 utf8codepoint(const char **_str)
+{
+    const char *str = *_str;
+    PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
+    PHYSFS_uint32 retval = 0;
+    PHYSFS_uint32 minval = 0;  /* smallest codepoint legal at this length. */
+    int trailing = 0;          /* continuation octets that must follow. */
+    int i;
+
+    if (octet == 0)  /* null terminator, end of string. */
+        return 0;
+
+    /*
+     * Consume the lead octet right away, so that every error return below
+     *  still moves the caller forward instead of handing back the same
+     *  octet on the next call.
+     */
+    (*_str)++;
+
+    if (octet < 128)  /* one octet char: 0 to 127 */
+        return(octet);
+
+    else if (octet < 192)  /* bad (starts with 10xxxxxx). */
+    {
+        /*
+         * Apparently each of these is supposed to be flagged as a bogus
+         *  char, instead of just resyncing to the next valid codepoint.
+         */
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } /* else if */
+
+    else if (octet < 224)  /* two octets: 110xxxxx */
+    {
+        trailing = 1;
+        retval = octet - (128+64);
+        minval = 0x80;
+    } /* else if */
+
+    else if (octet < 240)  /* three octets: 1110xxxx */
+    {
+        trailing = 2;
+        retval = octet - (128+64+32);
+        minval = 0x800;
+    } /* else if */
+
+    else if (octet < 248)  /* four octets: 11110xxx */
+    {
+        trailing = 3;
+        retval = octet - (128+64+32+16);
+        minval = 0x10000;
+    } /* else if */
+
+    /*
+     * Five and six octet sequences became illegal in rfc3629.
+     *  We throw the codepoint away, but parse them to make sure we move
+     *  ahead the right number of bytes and don't overflow the buffer.
+     */
+
+    else if (octet < 252)  /* five octets */
+        trailing = 4;
+
+    else  /* six octets */
+        trailing = 5;
+
+    for (i = 0; i < trailing; i++)
+    {
+        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
+        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
+            return UNICODE_BOGUS_CHAR_VALUE;  /* only lead octet consumed. */
+        retval = ((retval << 6) | (octet - 128));
+    } /* for */
+
+    *_str += trailing;  /* skip to next possible start of codepoint. */
+
+    if (trailing > 3)  /* five and six octet forms are never legal. */
+        return UNICODE_BOGUS_CHAR_VALUE;
+
+    /* Overlong encodings, and anything past the end of Unicode. */
+    else if ((retval < minval) || (retval > 0x10FFFF))
+        return UNICODE_BOGUS_CHAR_VALUE;
+
+    /* The whole UTF-16 surrogate range is illegal in UTF-8 (rfc3629). */
+    else if ((retval >= 0xD800) && (retval <= 0xDFFF))
+        return UNICODE_BOGUS_CHAR_VALUE;
+
+    /* 0xFFFE and 0xFFFF are illegal, too. */
+    else if ((retval == 0xFFFE) || (retval == 0xFFFF))
+        return UNICODE_BOGUS_CHAR_VALUE;
+
+    return retval;
+} /* utf8codepoint */
+
+/*
+ * Convert the null-terminated UTF-8 string (src) to UCS-4 in (dst).
+ *
+ * (len) is the size of (dst) in bytes. One element is always held back for
+ *  the null terminator, so output that doesn't fit is truncated but still
+ *  terminated. Sequences that utf8codepoint() reports as bogus are stored
+ *  as UNICODE_BOGUS_CHAR_CODEPOINT. If (len) can't hold even the
+ *  terminator, nothing is written at all.
+ */
+void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
+{
+    /*
+     * Bail before the subtraction below can wrap the unsigned length
+     *  around, which would let us write far past the end of (dst).
+     */
+    if (len < sizeof (PHYSFS_uint32))
+        return;
+
+    len -= sizeof (PHYSFS_uint32);   /* save room for null char. */
+    while (len >= sizeof (PHYSFS_uint32))
+    {
+        PHYSFS_uint32 cp = utf8codepoint(&src);
+        if (cp == 0)
+            break;
+        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+        *(dst++) = cp;
+        len -= sizeof (PHYSFS_uint32);
+    } /* while */
+
+    *dst = 0;
+} /* PHYSFS_utf8toucs4 */
+
+/*
+ * Convert the null-terminated UTF-8 string (src) to UCS-2 in (dst).
+ *
+ * (len) is the size of (dst) in bytes. One element is always held back for
+ *  the null terminator, so output that doesn't fit is truncated but still
+ *  terminated. Sequences that utf8codepoint() reports as bogus, and
+ *  codepoints above 0xFFFF (which would need UTF-16 surrogates), are stored
+ *  as UNICODE_BOGUS_CHAR_CODEPOINT. If (len) can't hold even the
+ *  terminator, nothing is written at all.
+ */
+void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
+{
+    /*
+     * Bail before the subtraction below can wrap the unsigned length
+     *  around, which would let us write far past the end of (dst).
+     */
+    if (len < sizeof (PHYSFS_uint16))
+        return;
+
+    len -= sizeof (PHYSFS_uint16);   /* save room for null char. */
+    while (len >= sizeof (PHYSFS_uint16))
+    {
+        PHYSFS_uint32 cp = utf8codepoint(&src);
+        if (cp == 0)
+            break;
+        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+
+        /* !!! BLUESKY: UTF-16 surrogates? */
+        if (cp > 0xFFFF)
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+
+        *(dst++) = (PHYSFS_uint16) cp;  /* known to fit: cp <= 0xFFFF here. */
+        len -= sizeof (PHYSFS_uint16);
+    } /* while */
+
+    *dst = 0;
+} /* PHYSFS_utf8toucs2 */
+
+/*
+ * Append the UTF-8 encoding of codepoint (cp) at (*_dst).
+ *
+ * (*_len) is the number of bytes still available at (*_dst). On success,
+ *  (*_dst) is advanced past the octets written and (*_len) is reduced by
+ *  the same amount. If the complete sequence doesn't fit, nothing is
+ *  written, (*_dst) is left alone, and (*_len) is set to zero so the caller
+ *  stops encoding; a partial sequence is never emitted. If (*_len) is
+ *  already zero, the call does nothing.
+ *
+ * Values that may not be encoded (above 0x10FFFF, 0xFFFE, 0xFFFF, and the
+ *  UTF-16 surrogates 0xD800-0xDFFF) are replaced with
+ *  UNICODE_BOGUS_CHAR_CODEPOINT before encoding.
+ */
+static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
+{
+    char *dst = *_dst;
+    PHYSFS_uint64 len = *_len;
+
+    if (len == 0)
+        return;
+
+    if (cp > 0x10FFFF)
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+    else if ((cp == 0xFFFE) || (cp == 0xFFFF))  /* illegal values. */
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+    else if ((cp >= 0xD800) && (cp <= 0xDFFF))
+    {
+        /* The whole UTF-16 surrogate range is illegal in UTF-8 (rfc3629). */
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+    } /* else if */
+
+    /* Do the encoding... */
+    if (cp < 0x80)
+    {
+        *(dst++) = (char) cp;
+        len--;
+    } /* if */
+
+    else if (cp < 0x800)
+    {
+        if (len < 2)
+            len = 0;
+        else
+        {
+            *(dst++) = (char) ((cp >> 6) | 128 | 64);
+            *(dst++) = (char) ((cp & 0x3F) | 128);
+            len -= 2;
+        } /* else */
+    } /* else if */
+
+    else if (cp < 0x10000)
+    {
+        if (len < 3)
+            len = 0;
+        else
+        {
+            *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
+            *(dst++) = (char) (((cp >> 6) & 0x3F) | 128);
+            *(dst++) = (char) ((cp & 0x3F) | 128);
+            len -= 3;
+        } /* else */
+    } /* else if */
+
+    else
+    {
+        if (len < 4)
+            len = 0;
+        else
+        {
+            *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
+            *(dst++) = (char) (((cp >> 12) & 0x3F) | 128);
+            *(dst++) = (char) (((cp >> 6) & 0x3F) | 128);
+            *(dst++) = (char) ((cp & 0x3F) | 128);
+            len -= 4;
+        } /* else */
+    } /* else */
+
+    *_dst = dst;
+    *_len = len;
+} /* utf8fromcodepoint */
+
+/*
+ * Shared body of the PHYSFS_utf8from*() converters. Only meant to be the
+ *  entire body of a void function, since it returns early on an empty
+ *  destination buffer.
+ *
+ * (typ) must be the UNSIGNED type of one source element. Each element is
+ *  cast to (typ) before being widened to a codepoint: reading Latin1
+ *  through a plain (possibly signed) char would otherwise sign-extend
+ *  every byte above 127 to 0xFFFFFFxx, which utf8fromcodepoint() then
+ *  replaces with UNICODE_BOGUS_CHAR_CODEPOINT.
+ *
+ * One byte of (len) is held back for the null terminator. When (len) is
+ *  zero there is no room even for that, so nothing is written; this also
+ *  keeps the unsigned decrement from wrapping around.
+ */
+#define UTF8FROMTYPE(typ, src, dst, len) \
+    if (len == 0) return; \
+    len--;  \
+    while (len) \
+    { \
+        const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
+        if (cp == 0) break; \
+        utf8fromcodepoint(cp, &dst, &len); \
+    } \
+    *dst = '\0';
+
+/* Convert null-terminated UCS-4 (src) to UTF-8 in (dst), (len) bytes big. */
+void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
+{
+    UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
+} /* PHYSFS_utf8fromucs4 */
+
+/* Convert null-terminated UCS-2 (src) to UTF-8 in (dst), (len) bytes big. */
+void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
+{
+    UTF8FROMTYPE(PHYSFS_uint16, src, dst, len);
+} /* PHYSFS_utf8fromucs2 */
+
+/* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
+void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len)
+{
+    UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
+} /* PHYSFS_utf8fromlatin1 */
+
+#undef UTF8FROMTYPE
+
+/* end of physfs_unicode.c ... */
+
From DONOTREPLY at icculus.org  Sun Nov  5 14:06:23 2006
From: DONOTREPLY at icculus.org (DONOTREPLY at icculus.org)
Date: 5 Nov 2006 14:06:23 -0500
Subject: r800 - in trunk: . platform
Message-ID: <20061105190623.6982.qmail@icculus.org>
Author: icculus
Date: 2006-11-05 14:06:23 -0500 (Sun, 05 Nov 2006)
New Revision: 800
Modified:
   trunk/CHANGELOG
   trunk/platform/beos.cpp
Log:
Apparently BeOS's BPath constructor doesn't actually _need_ leaf to be
 manually split out.
Modified: trunk/CHANGELOG
===================================================================
--- trunk/CHANGELOG	2006-11-05 11:32:18 UTC (rev 799)
+++ trunk/CHANGELOG	2006-11-05 19:06:23 UTC (rev 800)
@@ -3,6 +3,7 @@
  */
 
 11052006 - More 7zip archiver work (thanks, Dennis!). Initial Unicode work.
+           Minor BeOS realpath tweak.
 09272006 - Reworked 7zip archiver (thanks, Dennis!).
 09232006 - Fixed typo in doxygen comment.
 04112006 - Added LZMA archiver...7zip support (thanks, Dennis!).
Modified: trunk/platform/beos.cpp
===================================================================
--- trunk/platform/beos.cpp	2006-11-05 11:32:18 UTC (rev 799)
+++ trunk/platform/beos.cpp	2006-11-05 19:06:23 UTC (rev 800)
@@ -200,14 +200,7 @@
 
 char *__PHYSFS_platformRealPath(const char *path)
 {
-    char *str = (char *) alloca(strlen(path) + 1);
-    BAIL_IF_MACRO(str == NULL, ERR_OUT_OF_MEMORY, NULL);
-    strcpy(str, path);
-    char *leaf = strrchr(str, '/');
-    if (leaf != NULL)
-        *(leaf++) = '\0';
-
-    BPath normalized(str, leaf, true);  /* force normalization of path. */
+    BPath normalized(path, NULL, true);  /* force normalization of path. */
     const char *resolved_path = normalized.Path();
     BAIL_IF_MACRO(resolved_path == NULL, ERR_NO_SUCH_FILE, NULL);
     char *retval = (char *) allocator.Malloc(strlen(resolved_path) + 1);