r539 - trunk

Sun Mar 2 01:37:37 EST 2008

Author: icculus
Date: 2008-03-02 01:37:37 -0500 (Sun, 02 Mar 2008)
New Revision: 539

Modified:
   trunk/gui.h
   trunk/mojosetup.c
   trunk/platform_windows.c
   trunk/universal.h
Log:
Moved utf8codepoint() out of platform_windows.c and into the core, with hooks
 for the GUI plugins, so I can clean up the Unicode FIXMEs in the stdio and
 ncurses targets.


Modified: trunk/gui.h
===================================================================

--- trunk/gui.h	2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/gui.h	2008-03-02 06:37:37 UTC (rev 539)
@@ -249,9 +249,14 @@
 #endif
 #define ticks() entry->ticks()
 
+#ifdef utf8codepoint
+#undef utf8codepoint
 #endif
+#define utf8codepoint(x) entry->utf8codepoint(x)
 
+#endif
 
+
 /*
  * make some decisions about which GUI plugins to build...
  *  We list them all here, but some are built, some aren't. Some are DLLs,

Modified: trunk/mojosetup.c
===================================================================
--- trunk/mojosetup.c	2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/mojosetup.c	2008-03-02 06:37:37 UTC (rev 539)
@@ -39,6 +39,7 @@
     format,
     numstr,
     MojoPlatform_ticks,
+    utf8codepoint,
 };
 
 int GArgc = 0;
@@ -629,6 +630,153 @@
 } // xstrncpy
 
 
+uint32 utf8codepoint(const char **_str)
+{
+    const char *str = *_str;
+    uint32 retval = 0;
+    uint32 octet = (uint32) ((uint8) *str);
+    uint32 octet2, octet3, octet4;
+
+    if (octet == 0)  // null terminator, end of string.
+        return 0;
+
+    else if (octet < 128)  // one octet char: 0 to 127
+    {
+        (*_str)++;  // skip to next possible start of codepoint.
+        return octet;
+    } // else if
+
+    else if ((octet > 127) && (octet < 192))  // bad (starts with 10xxxxxx).
+    {
+        // Apparently each of these is supposed to be flagged as a bogus
+        //  char, instead of just resyncing to the next valid codepoint.
+        (*_str)++;  // skip to next possible start of codepoint.
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } // else if
+
+    else if (octet < 224)  // two octets
+    {
+        octet -= (128+64);
+        octet2 = (uint32) ((uint8) *(++str));
+        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 2;  // skip to next possible start of codepoint.
+        retval = ((octet << 6) | (octet2 - 128));
+        if ((retval >= 0x80) && (retval <= 0x7FF))
+            return retval;
+    } // else if
+
+    else if (octet < 240)  // three octets
+    {
+        octet -= (128+64+32);
+        octet2 = (uint32) ((uint8) *(++str));
+        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet3 = (uint32) ((uint8) *(++str));
+        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 3;  // skip to next possible start of codepoint.
+        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
+
+        // There are seven "UTF-16 surrogates" that are illegal in UTF-8.
+        switch (retval)
+        {
+            case 0xD800:
+            case 0xDB7F:
+            case 0xDB80:
+            case 0xDBFF:
+            case 0xDC00:
+            case 0xDF80:
+            case 0xDFFF:
+                return UNICODE_BOGUS_CHAR_VALUE;
+        } // switch
+
+        // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
+        if ((retval >= 0x800) && (retval <= 0xFFFD))
+            return retval;
+    } // else if
+
+    else if (octet < 248)  // four octets
+    {
+        octet -= (128+64+32+16);
+        octet2 = (uint32) ((uint8) *(++str));
+        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet3 = (uint32) ((uint8) *(++str));
+        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet4 = (uint32) ((uint8) *(++str));
+        if ((octet4 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 4;  // skip to next possible start of codepoint.
+        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
+                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
+        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
+            return retval;
+    } // else if
+
+    // Five and six octet sequences became illegal in rfc3629.
+    //  We throw the codepoint away, but parse them to make sure we move
+    //  ahead the right number of bytes and don't overflow the buffer.
+
+    else if (octet < 252)  // five octets
+    {
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 5;  // skip to next possible start of codepoint.
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } // else if
+
+    else  // six octets
+    {
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (uint32) ((uint8) *(++str));
+        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 6;  // skip to next possible start of codepoint.
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } // else if
+
+    return UNICODE_BOGUS_CHAR_VALUE;
+} // utf8codepoint
+
+
 static void outOfMemory(void)
 {
     // Try to translate "out of memory", but not if it causes recursion.

Modified: trunk/platform_windows.c
===================================================================
--- trunk/platform_windows.c	2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/platform_windows.c	2008-03-02 06:37:37 UTC (rev 539)
@@ -71,155 +71,8 @@
 #endif
 
 
-#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
-#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
+// these utf-8 functions may move to mojosetup.c some day...
 
-static uint32 utf8codepoint(const char **_str)
-{
-    const char *str = *_str;
-    uint32 retval = 0;
-    uint32 octet = (uint32) ((uint8) *str);
-    uint32 octet2, octet3, octet4;
-
-    if (octet == 0)  // null terminator, end of string.
-        return 0;
-
-    else if (octet < 128)  // one octet char: 0 to 127
-    {
-        (*_str)++;  // skip to next possible start of codepoint.
-        return octet;
-    } // else if
-
-    else if ((octet > 127) && (octet < 192))  // bad (starts with 10xxxxxx).
-    {
-        // Apparently each of these is supposed to be flagged as a bogus
-        //  char, instead of just resyncing to the next valid codepoint.
-        (*_str)++;  // skip to next possible start of codepoint.
-        return UNICODE_BOGUS_CHAR_VALUE;
-    } // else if
-
-    else if (octet < 224)  // two octets
-    {
-        octet -= (128+64);
-        octet2 = (uint32) ((uint8) *(++str));
-        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 2;  // skip to next possible start of codepoint.
-        retval = ((octet << 6) | (octet2 - 128));
-        if ((retval >= 0x80) && (retval <= 0x7FF))
-            return retval;
-    } // else if
-
-    else if (octet < 240)  // three octets
-    {
-        octet -= (128+64+32);
-        octet2 = (uint32) ((uint8) *(++str));
-        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet3 = (uint32) ((uint8) *(++str));
-        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 3;  // skip to next possible start of codepoint.
-        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
-
-        // There are seven "UTF-16 surrogates" that are illegal in UTF-8.
-        switch (retval)
-        {
-            case 0xD800:
-            case 0xDB7F:
-            case 0xDB80:
-            case 0xDBFF:
-            case 0xDC00:
-            case 0xDF80:
-            case 0xDFFF:
-                return UNICODE_BOGUS_CHAR_VALUE;
-        } // switch
-
-        // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
-        if ((retval >= 0x800) && (retval <= 0xFFFD))
-            return retval;
-    } // else if
-
-    else if (octet < 248)  // four octets
-    {
-        octet -= (128+64+32+16);
-        octet2 = (uint32) ((uint8) *(++str));
-        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet3 = (uint32) ((uint8) *(++str));
-        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet4 = (uint32) ((uint8) *(++str));
-        if ((octet4 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 4;  // skip to next possible start of codepoint.
-        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
-                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
-        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
-            return retval;
-    } // else if
-
-    // Five and six octet sequences became illegal in rfc3629.
-    //  We throw the codepoint away, but parse them to make sure we move
-    //  ahead the right number of bytes and don't overflow the buffer.
-
-    else if (octet < 252)  // five octets
-    {
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 5;  // skip to next possible start of codepoint.
-        return UNICODE_BOGUS_CHAR_VALUE;
-    } // else if
-
-    else  // six octets
-    {
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 6;  // skip to next possible start of codepoint.
-        return UNICODE_BOGUS_CHAR_VALUE;
-    } // else if
-
-    return UNICODE_BOGUS_CHAR_VALUE;
-} // utf8codepoint
-
 void utf8ToUcs2(const char *src, uint16 *dst, uint64 len)
 {
     len -= sizeof (uint16);   // save room for null char.

Modified: trunk/universal.h
===================================================================
--- trunk/universal.h	2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/universal.h	2008-03-02 06:37:37 UTC (rev 539)
@@ -96,6 +96,13 @@
 // Static, non-stack memory for scratch work...not thread safe!
 extern uint8 scratchbuf_128k[128 * 1024];
 
+
+#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
+#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
+// !!! FIXME: document me!
+uint32 utf8codepoint(const char **_str);
+
+
 // Format a string, sort of (but not exactly!) like sprintf().
 //  The only formatters accepted are %0 through %9 (and %%), which do not
 //  have to appear in order in the string, but match the varargs passed to the
@@ -347,6 +354,7 @@
     char *(*format)(const char *fmt, ...);
     const char *(*numstr)(int val);
     uint32 (*ticks)(void);
+    uint32 (*utf8codepoint)(const char **_str);
 } MojoSetupEntryPoints;
 extern MojoSetupEntryPoints GEntryPoints;