[mojosetup] utf8codepoint() must advance the pointer even when encountering an invalid character

Francois Gouget fgouget at codeweavers.com
Mon May 23 18:08:52 EDT 2011


# HG changeset patch
# User Francois Gouget <fgouget at codeweavers.com>
# Date 1306186314 -7200
# Node ID 446e28688755f9297d0b1a7e073f7c5bfc75147e
# Parent  bc35a9e23f6d798f5da5ba91bbca1692d617e82c
utf8codepoint() must advance the pointer even when encountering an invalid character.

Otherwise utf8len() gets into an infinite loop if the string is not valid UTF-8, because the pointer never moves past the invalid byte.

diff -r bc35a9e23f6d -r 446e28688755 mojosetup.c
--- a/mojosetup.c	Mon May 23 23:32:09 2011 +0200
+++ b/mojosetup.c	Mon May 23 23:31:54 2011 +0200
@@ -667,12 +667,13 @@
 
     else if (octet < 224)  // two octets
     {
+        (*_str)++;  // advance at least one byte in case of an error
         octet -= (128+64);
         octet2 = (uint32) ((uint8) *(++str));
         if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
             return UNICODE_BOGUS_CHAR_VALUE;
 
-        *_str += 2;  // skip to next possible start of codepoint.
+        *_str += 1;  // skip to next possible start of codepoint.
         retval = ((octet << 6) | (octet2 - 128));
         if ((retval >= 0x80) && (retval <= 0x7FF))
             return retval;
@@ -680,6 +681,7 @@
 
     else if (octet < 240)  // three octets
     {
+        (*_str)++;  // advance at least one byte in case of an error
         octet -= (128+64+32);
         octet2 = (uint32) ((uint8) *(++str));
         if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
@@ -689,7 +691,7 @@
         if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
             return UNICODE_BOGUS_CHAR_VALUE;
 
-        *_str += 3;  // skip to next possible start of codepoint.
+        *_str += 2;  // skip to next possible start of codepoint.
         retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
 
         // There are seven "UTF-16 surrogates" that are illegal in UTF-8.
@@ -712,6 +714,7 @@
 
     else if (octet < 248)  // four octets
     {
+        (*_str)++;  // advance at least one byte in case of an error
         octet -= (128+64+32+16);
         octet2 = (uint32) ((uint8) *(++str));
         if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
@@ -725,7 +728,7 @@
         if ((octet4 & (128+64)) != 128)  // Format isn't 10xxxxxx?
             return UNICODE_BOGUS_CHAR_VALUE;
 
-        *_str += 4;  // skip to next possible start of codepoint.
+        *_str += 3;  // skip to next possible start of codepoint.
         retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
                    ((octet3 - 128) << 6) | ((octet4 - 128)) );
         if ((retval >= 0x10000) && (retval <= 0x10FFFF))
@@ -738,6 +741,7 @@
 
     else if (octet < 252)  // five octets
     {
+        (*_str)++;  // advance at least one byte in case of an error
         octet = (uint32) ((uint8) *(++str));
         if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
             return UNICODE_BOGUS_CHAR_VALUE;
@@ -754,7 +758,7 @@
         if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
             return UNICODE_BOGUS_CHAR_VALUE;
 
-        *_str += 5;  // skip to next possible start of codepoint.
+        *_str += 4;  // skip to next possible start of codepoint.
         return UNICODE_BOGUS_CHAR_VALUE;
     } // else if
 



More information about the mojosetup mailing list