// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 2; -*-
#ifndef   __ustring_hh
#define   __ustring_hh

/*! @file ustring.hh
  @brief Provides a simple UTF-8 encoded string
*/

extern "C" {

#ifdef HAVE_STDINT_H
#  include <stdint.h>
#else
#  ifdef HAVE_SYS_TYPES_H
#    include <sys/types.h>
#  endif
#endif

}

#include <iterator>
#include <string>

namespace otk {


//! A single Unicode code point, stored in a 32-bit unsigned integer
#ifdef HAVE_STDINT_H
typedef uint32_t unichar;
#else
typedef u_int32_t unichar;
#endif


#ifndef DOXYGEN_IGNORE

// Declared here and defined outside this header. It is called by
// ustring_Iterator::operator*() with a pointer to the byte the iterator
// currently refers to; presumably it decodes the UTF-8 sequence starting
// at that byte (confirm against the implementation).
unichar utf8_get_char(const char *p);

#endif // DOXYGEN_IGNORE

//! The iterator type for ustring
/*!
  Note this is not a random access iterator but a bidirectional one, since all
  index operations need to iterate over the UTF-8 data. Use std::advance() to
  move to a certain position.
  <p>
  A writeable iterator isn't provided because: The number of bytes of the old
  UTF-8 character and the new one to write could be different. Therefore, any
  write operation would invalidate all other iterators pointing into the same
  string.
  <p>
  No comparison operators are declared for this class; compare the values
  returned by base() instead.
*/
template <class T>
class ustring_Iterator
{
public:
  typedef std::bidirectional_iterator_tag   iterator_category;
  typedef unichar                           value_type;
  typedef std::string::difference_type      difference_type;
  // Dereferencing yields a decoded character by value, so the "reference"
  // type is the value type itself. All five nested types have to be present
  // for std::iterator_traits (and therefore std::advance()) to be usable.
  typedef value_type                        reference;
  typedef void                              pointer;

  //! Constructs an iterator holding a value-initialized underlying iterator
  inline ustring_Iterator() : _pos() {}

  //! Converts from the mutable-string flavour of the iterator
  /*!
    For T = std::string::iterator this is the ordinary copy constructor.
  */
  inline ustring_Iterator(const ustring_Iterator<std::string::iterator>&
                          other) : _pos(other.base()) {}

  //! Wraps an iterator into the underlying std::string
  /*!
    @param pos Position in the byte string; the other members assume it
               refers to the first byte of a UTF-8 sequence.
  */
  explicit inline ustring_Iterator(T pos) : _pos(pos) {}

  //! Returns the wrapped std::string iterator
  inline T base() const { return _pos; }

  //! Returns the character at the current position
  inline value_type operator*() const {
    // get an iterator to the internal string
    std::string::const_iterator pos = _pos;
    return utf8_get_char(&(*pos));
  }

  //! Moves forward by one character, i.e. over one whole UTF-8 sequence
  inline ustring_Iterator<T> & operator++() {
    _pos += sequence_length(static_cast<unsigned char>(*_pos));
    return *this;
  }

  //! Moves backward by one character
  inline ustring_Iterator<T> & operator--() {
    // Step back over continuation bytes (bit pattern 10xxxxxx) until a byte
    // that starts a sequence is reached.
    do {
      --_pos;
    } while ((static_cast<unsigned char>(*_pos) & 0xC0) == 0x80);
    return *this;
  }

private:
  //! Number of bytes to skip for a sequence that begins with @p lead
  /*!
    Yields the same lengths as GLib's g_utf8_skip table: one byte for ASCII,
    for stray continuation bytes and for the invalid values 0xFE/0xFF, and
    two to six bytes for the multi-byte lead bytes.
  */
  static inline difference_type sequence_length(unsigned char lead) {
    if (lead < 0xC0) return 1;
    if (lead < 0xE0) return 2;
    if (lead < 0xF0) return 3;
    if (lead < 0xF8) return 4;
    if (lead < 0xFC) return 5;
    if (lead < 0xFE) return 6;
    return 1;
  }

  T _pos;
};


//! This class provides a simple wrapper to a std::string that can be encoded
//! as UTF-8. The ustring::utf8() member specifies if the given string is UTF-8
//! encoded. ustrings default to specifying UTF-8 encoding.
/*!
  This class does <b>not</b> handle extended 8-bit ASCII charsets like
  ISO-8859-1.
  <p>
  More info on Unicode and UTF-8 can be found here:
  http://www.cl.cam.ac.uk/~mgk25/unicode.html
  <p>
  This does not subclass std::string, because std::string was intended to be a
  final class.  For instance, it does not have a virtual destructor.
*/
class ustring {
  std::string _string;   // the wrapped byte string
  bool _utf8;            // encoding flag, see utf8() / setUtf8()

public:
  typedef std::string::size_type                        size_type;
  typedef std::string::difference_type                  difference_type;

  typedef unichar                                       value_type;
  //typedef unichar &                                     reference;
  //typedef const unichar &                               const_reference;

  //typedef ustring_Iterator<std::string::iterator>       iterator;
  //typedef ustring_Iterator<std::string::const_iterator> const_iterator;

  //! Same value as std::string::npos
  static const size_type npos = std::string::npos;

  //! Default constructor; the encoding flag defaults to UTF-8
  ustring(bool utf8 = true);
  ~ustring();

  // make new strings

  ustring(const ustring& other);
  ustring& operator=(const ustring& other);
  ustring(const std::string& src, bool utf8 = true);
  ustring(const char* src, bool utf8 = true);

  // append to the string

  ustring& operator+=(const ustring& src);
  ustring& operator+=(const char* src);
  ustring& operator+=(char c);

  // sizes
  // NOTE(review): presumably size() counts characters and bytes() counts
  // bytes of the encoded data -- confirm against the implementation.

  ustring::size_type size() const;
  ustring::size_type bytes() const;
  ustring::size_type capacity() const;
  ustring::size_type max_size() const;
  bool empty() const;

  // erase substrings

  void clear();
  ustring& erase(size_type i, size_type n=npos);

  // change the string's size

  void resize(size_type n, char c='\0');

  // extract characters

  // No reference return; use replace() to write characters.
  // NOTE(review): no replace() member is declared in this header.
  value_type operator[](size_type i) const;

  // compare strings

  bool operator==(const ustring &other) const;
  bool operator==(const std::string &other) const;
  bool operator==(const char *other) const;

  // internal data

  const char* data() const;
  const char* c_str() const;

  // encoding

  bool utf8() const;
  void setUtf8(bool utf8);
};

}

#endif // __ustring_hh