r539 - trunk
DONOTREPLY at icculus.org
DONOTREPLY at icculus.org
Sun Mar 2 01:37:37 EST 2008
Author: icculus
Date: 2008-03-02 01:37:37 -0500 (Sun, 02 Mar 2008)
New Revision: 539
Modified:
trunk/gui.h
trunk/mojosetup.c
trunk/platform_windows.c
trunk/universal.h
Log:
Moved utf8codepoint() out of platform_windows.c and into the core, with hooks
for the GUI plugins, so I can clean up the Unicode FIXMEs in the stdio and
ncurses targets.
Modified: trunk/gui.h
===================================================================
--- trunk/gui.h 2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/gui.h 2008-03-02 06:37:37 UTC (rev 539)
@@ -249,9 +249,14 @@
#endif
#define ticks() entry->ticks()
+#ifdef utf8codepoint
+#undef utf8codepoint
#endif
+#define utf8codepoint(x) entry->utf8codepoint(x)
+#endif
+
/*
* make some decisions about which GUI plugins to build...
* We list them all here, but some are built, some aren't. Some are DLLs,
Modified: trunk/mojosetup.c
===================================================================
--- trunk/mojosetup.c 2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/mojosetup.c 2008-03-02 06:37:37 UTC (rev 539)
@@ -39,6 +39,7 @@
format,
numstr,
MojoPlatform_ticks,
+ utf8codepoint,
};
int GArgc = 0;
@@ -629,6 +630,153 @@
} // xstrncpy
+uint32 utf8codepoint(const char **_str)
+{
+ const char *str = *_str;
+ uint32 retval = 0;
+ uint32 octet = (uint32) ((uint8) *str);
+ uint32 octet2, octet3, octet4;
+
+ if (octet == 0) // null terminator, end of string.
+ return 0;
+
+ else if (octet < 128) // one octet char: 0 to 127
+ {
+ (*_str)++; // skip to next possible start of codepoint.
+ return octet;
+ } // else if
+
+ else if ((octet > 127) && (octet < 192)) // bad (starts with 10xxxxxx).
+ {
+ // Apparently each of these is supposed to be flagged as a bogus
+ // char, instead of just resyncing to the next valid codepoint.
+ (*_str)++; // skip to next possible start of codepoint.
+ return UNICODE_BOGUS_CHAR_VALUE;
+ } // else if
+
+ else if (octet < 224) // two octets
+ {
+ octet -= (128+64);
+ octet2 = (uint32) ((uint8) *(++str));
+ if ((octet2 & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ *_str += 2; // skip to next possible start of codepoint.
+ retval = ((octet << 6) | (octet2 - 128));
+ if ((retval >= 0x80) && (retval <= 0x7FF))
+ return retval;
+ } // else if
+
+ else if (octet < 240) // three octets
+ {
+ octet -= (128+64+32);
+ octet2 = (uint32) ((uint8) *(++str));
+ if ((octet2 & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet3 = (uint32) ((uint8) *(++str));
+ if ((octet3 & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ *_str += 3; // skip to next possible start of codepoint.
+ retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
+
+ // There are seven "UTF-16 surrogates" that are illegal in UTF-8.
+ switch (retval)
+ {
+ case 0xD800:
+ case 0xDB7F:
+ case 0xDB80:
+ case 0xDBFF:
+ case 0xDC00:
+ case 0xDF80:
+ case 0xDFFF:
+ return UNICODE_BOGUS_CHAR_VALUE;
+ } // switch
+
+ // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
+ if ((retval >= 0x800) && (retval <= 0xFFFD))
+ return retval;
+ } // else if
+
+ else if (octet < 248) // four octets
+ {
+ octet -= (128+64+32+16);
+ octet2 = (uint32) ((uint8) *(++str));
+ if ((octet2 & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet3 = (uint32) ((uint8) *(++str));
+ if ((octet3 & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet4 = (uint32) ((uint8) *(++str));
+ if ((octet4 & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ *_str += 4; // skip to next possible start of codepoint.
+ retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
+ ((octet3 - 128) << 6) | ((octet4 - 128)) );
+ if ((retval >= 0x10000) && (retval <= 0x10FFFF))
+ return retval;
+ } // else if
+
+ // Five and six octet sequences became illegal in rfc3629.
+ // We throw the codepoint away, but parse them to make sure we move
+ // ahead the right number of bytes and don't overflow the buffer.
+
+ else if (octet < 252) // five octets
+ {
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ *_str += 5; // skip to next possible start of codepoint.
+ return UNICODE_BOGUS_CHAR_VALUE;
+ } // else if
+
+ else // six octets
+ {
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ octet = (uint32) ((uint8) *(++str));
+ if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
+ return UNICODE_BOGUS_CHAR_VALUE;
+
+ *_str += 6; // skip to next possible start of codepoint.
+ return UNICODE_BOGUS_CHAR_VALUE;
+ } // else if
+
+ return UNICODE_BOGUS_CHAR_VALUE;
+} // utf8codepoint
+
+
static void outOfMemory(void)
{
// Try to translate "out of memory", but not if it causes recursion.
Modified: trunk/platform_windows.c
===================================================================
--- trunk/platform_windows.c 2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/platform_windows.c 2008-03-02 06:37:37 UTC (rev 539)
@@ -71,155 +71,8 @@
#endif
-#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
-#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
+// these utf-8 functions may move to mojosetup.c some day...
-static uint32 utf8codepoint(const char **_str)
-{
- const char *str = *_str;
- uint32 retval = 0;
- uint32 octet = (uint32) ((uint8) *str);
- uint32 octet2, octet3, octet4;
-
- if (octet == 0) // null terminator, end of string.
- return 0;
-
- else if (octet < 128) // one octet char: 0 to 127
- {
- (*_str)++; // skip to next possible start of codepoint.
- return octet;
- } // else if
-
- else if ((octet > 127) && (octet < 192)) // bad (starts with 10xxxxxx).
- {
- // Apparently each of these is supposed to be flagged as a bogus
- // char, instead of just resyncing to the next valid codepoint.
- (*_str)++; // skip to next possible start of codepoint.
- return UNICODE_BOGUS_CHAR_VALUE;
- } // else if
-
- else if (octet < 224) // two octets
- {
- octet -= (128+64);
- octet2 = (uint32) ((uint8) *(++str));
- if ((octet2 & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- *_str += 2; // skip to next possible start of codepoint.
- retval = ((octet << 6) | (octet2 - 128));
- if ((retval >= 0x80) && (retval <= 0x7FF))
- return retval;
- } // else if
-
- else if (octet < 240) // three octets
- {
- octet -= (128+64+32);
- octet2 = (uint32) ((uint8) *(++str));
- if ((octet2 & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet3 = (uint32) ((uint8) *(++str));
- if ((octet3 & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- *_str += 3; // skip to next possible start of codepoint.
- retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
-
- // There are seven "UTF-16 surrogates" that are illegal in UTF-8.
- switch (retval)
- {
- case 0xD800:
- case 0xDB7F:
- case 0xDB80:
- case 0xDBFF:
- case 0xDC00:
- case 0xDF80:
- case 0xDFFF:
- return UNICODE_BOGUS_CHAR_VALUE;
- } // switch
-
- // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
- if ((retval >= 0x800) && (retval <= 0xFFFD))
- return retval;
- } // else if
-
- else if (octet < 248) // four octets
- {
- octet -= (128+64+32+16);
- octet2 = (uint32) ((uint8) *(++str));
- if ((octet2 & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet3 = (uint32) ((uint8) *(++str));
- if ((octet3 & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet4 = (uint32) ((uint8) *(++str));
- if ((octet4 & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- *_str += 4; // skip to next possible start of codepoint.
- retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
- ((octet3 - 128) << 6) | ((octet4 - 128)) );
- if ((retval >= 0x10000) && (retval <= 0x10FFFF))
- return retval;
- } // else if
-
- // Five and six octet sequences became illegal in rfc3629.
- // We throw the codepoint away, but parse them to make sure we move
- // ahead the right number of bytes and don't overflow the buffer.
-
- else if (octet < 252) // five octets
- {
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- *_str += 5; // skip to next possible start of codepoint.
- return UNICODE_BOGUS_CHAR_VALUE;
- } // else if
-
- else // six octets
- {
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- octet = (uint32) ((uint8) *(++str));
- if ((octet & (128+64)) != 128) // Format isn't 10xxxxxx?
- return UNICODE_BOGUS_CHAR_VALUE;
-
- *_str += 6; // skip to next possible start of codepoint.
- return UNICODE_BOGUS_CHAR_VALUE;
- } // else if
-
- return UNICODE_BOGUS_CHAR_VALUE;
-} // utf8codepoint
-
void utf8ToUcs2(const char *src, uint16 *dst, uint64 len)
{
len -= sizeof (uint16); // save room for null char.
Modified: trunk/universal.h
===================================================================
--- trunk/universal.h 2008-03-02 06:22:08 UTC (rev 538)
+++ trunk/universal.h 2008-03-02 06:37:37 UTC (rev 539)
@@ -96,6 +96,13 @@
// Static, non-stack memory for scratch work...not thread safe!
extern uint8 scratchbuf_128k[128 * 1024];
+
+#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
+#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
+// !!! FIXME: document me!
+uint32 utf8codepoint(const char **_str);
+
+
// Format a string, sort of (but not exactly!) like sprintf().
// The only formatters accepted are %0 through %9 (and %%), which do not
// have to appear in order in the string, but match the varargs passed to the
@@ -347,6 +354,7 @@
char *(*format)(const char *fmt, ...);
const char *(*numstr)(int val);
uint32 (*ticks)(void);
+ uint32 (*utf8codepoint)(const char **_str);
} MojoSetupEntryPoints;
extern MojoSetupEntryPoints GEntryPoints;
More information about the mojosetup-commits mailing list