00001 // Filename: textEncoder.I 00002 // Created by: drose (26Mar03) 00003 // 00004 //////////////////////////////////////////////////////////////////// 00005 // 00006 // PANDA 3D SOFTWARE 00007 // Copyright (c) 2001, Disney Enterprises, Inc. All rights reserved 00008 // 00009 // All use of this software is subject to the terms of the Panda 3d 00010 // Software license. You should have received a copy of this license 00011 // along with this source code; you will also find a current copy of 00012 // the license at http://www.panda3d.org/license.txt . 00013 // 00014 // To contact the maintainers of this program write to 00015 // panda3d@yahoogroups.com . 00016 // 00017 //////////////////////////////////////////////////////////////////// 00018 00019 00020 //////////////////////////////////////////////////////////////////// 00021 // Function: TextEncoder::Constructor 00022 // Access: Published 00023 // Description: 00024 //////////////////////////////////////////////////////////////////// 00025 INLINE TextEncoder:: 00026 TextEncoder() { 00027 _encoding = _default_encoding; 00028 00029 // Initially, since the text string is empty, we know that both 00030 // _text and _wtext accurately reflect the empty state; so we "got" 00031 // both of them. 00032 _flags = (F_got_text | F_got_wtext); 00033 } 00034 00035 //////////////////////////////////////////////////////////////////// 00036 // Function: TextEncoder::set_encoding 00037 // Access: Published 00038 // Description: Specifies how the string set via set_text() is to be 00039 // interpreted. The default, E_iso8859, means a 00040 // standard string with one-byte characters 00041 // (i.e. ASCII). Other encodings are possible to take 00042 // advantage of character sets with more than 256 00043 // characters. 00044 // 00045 // This affects only future calls to set_text(); it does 00046 // not change text that was set previously. 00047 //////////////////////////////////////////////////////////////////// 00048 INLINE void TextEncoder:: 00049 set_encoding(TextEncoder::Encoding encoding) { 00050 // Force the previously-set strings to be encoded or decoded now. 00051 get_text(); 00052 get_wtext(); 00053 _encoding = encoding; 00054 } 00055 00056 //////////////////////////////////////////////////////////////////// 00057 // Function: TextEncoder::get_encoding 00058 // Access: Published 00059 // Description: Returns the encoding by which the string set via 00060 // set_text() is to be interpreted. See set_encoding(). 00061 //////////////////////////////////////////////////////////////////// 00062 INLINE TextEncoder::Encoding TextEncoder:: 00063 get_encoding() const { 00064 return _encoding; 00065 } 00066 00067 //////////////////////////////////////////////////////////////////// 00068 // Function: TextEncoder::set_default_encoding 00069 // Access: Published, Static 00070 // Description: Specifies the default encoding to be used for all 00071 // subsequently created TextEncoder objects. See 00072 // set_encoding(). 00073 //////////////////////////////////////////////////////////////////// 00074 INLINE void TextEncoder:: 00075 set_default_encoding(TextEncoder::Encoding encoding) { 00076 _default_encoding = encoding; 00077 } 00078 00079 //////////////////////////////////////////////////////////////////// 00080 // Function: TextEncoder::get_default_encoding 00081 // Access: Published, Static 00082 // Description: Specifies the default encoding to be used for all 00083 // subsequently created TextEncoder objects. See 00084 // set_encoding(). 00085 //////////////////////////////////////////////////////////////////// 00086 INLINE TextEncoder::Encoding TextEncoder:: 00087 get_default_encoding() { 00088 return _default_encoding; 00089 } 00090 00091 //////////////////////////////////////////////////////////////////// 00092 // Function: TextEncoder::set_text 00093 // Access: Published 00094 // Description: Changes the text that is stored in the encoder. The 00095 // text should be encoded according to the method 00096 // indicated by set_encoding(). Subsequent calls to 00097 // get_text() will return this same string, while 00098 // get_wtext() will return the decoded version of the 00099 // string. 00100 //////////////////////////////////////////////////////////////////// 00101 INLINE void TextEncoder:: 00102 set_text(const string &text) { 00103 if (!has_text() || _text != text) { 00104 _text = text; 00105 _flags = (_flags | F_got_text) & ~F_got_wtext; 00106 } 00107 } 00108 00109 //////////////////////////////////////////////////////////////////// 00110 // Function: TextEncoder::set_text 00111 // Access: Published 00112 // Description: The two-parameter version of set_text() accepts an 00113 // explicit encoding; the text is immediately decoded 00114 // and stored as a wide-character string. Subsequent 00115 // calls to get_text() will return the same text 00116 // re-encoded using whichever encoding is specified by 00117 // set_encoding(). 00118 //////////////////////////////////////////////////////////////////// 00119 INLINE void TextEncoder:: 00120 set_text(const string &text, TextEncoder::Encoding encoding) { 00121 set_wtext(decode_text(text, encoding)); 00122 } 00123 00124 //////////////////////////////////////////////////////////////////// 00125 // Function: TextEncoder::clear_text 00126 // Access: Published 00127 // Description: Removes the text from the TextEncoder. 00128 //////////////////////////////////////////////////////////////////// 00129 INLINE void TextEncoder:: 00130 clear_text() { 00131 _text = string(); 00132 _wtext = wstring(); 00133 _flags |= (F_got_text | F_got_wtext); 00134 } 00135 00136 //////////////////////////////////////////////////////////////////// 00137 // Function: TextEncoder::has_text 00138 // Access: Published 00139 // Description: 00140 //////////////////////////////////////////////////////////////////// 00141 INLINE bool TextEncoder:: 00142 has_text() const { 00143 if (_flags & F_got_wtext) { 00144 return !_wtext.empty(); 00145 } else { 00146 return !_text.empty(); 00147 } 00148 } 00149 00150 //////////////////////////////////////////////////////////////////// 00151 // Function: TextEncoder::get_text 00152 // Access: Published 00153 // Description: Returns the current text, as encoded via the current 00154 // encoding system. 00155 //////////////////////////////////////////////////////////////////// 00156 INLINE string TextEncoder:: 00157 get_text() const { 00158 if ((_flags & F_got_text) == 0) { 00159 ((TextEncoder *)this)->_text = encode_wtext(_wtext); 00160 ((TextEncoder *)this)->_flags |= F_got_text; 00161 } 00162 return _text; 00163 } 00164 00165 //////////////////////////////////////////////////////////////////// 00166 // Function: TextEncoder::get_text 00167 // Access: Published 00168 // Description: Returns the current text, as encoded via the indicated 00169 // encoding system. 00170 //////////////////////////////////////////////////////////////////// 00171 INLINE string TextEncoder:: 00172 get_text(TextEncoder::Encoding encoding) const { 00173 return encode_wtext(get_wtext(), encoding); 00174 } 00175 00176 //////////////////////////////////////////////////////////////////// 00177 // Function: TextEncoder::append_text 00178 // Access: Published 00179 // Description: Appends the indicates string to the end of the stored 00180 // text. 00181 //////////////////////////////////////////////////////////////////// 00182 INLINE void TextEncoder:: 00183 append_text(const string &text) { 00184 _text = get_text() + text; 00185 _flags = (_flags | F_got_text) & ~F_got_wtext; 00186 } 00187 00188 //////////////////////////////////////////////////////////////////// 00189 // Function: TextEncoder::append_unicode_char 00190 // Access: Published 00191 // Description: Appends a single character to the end of the stored 00192 // text. This may be a wide character, up to 16 bits in 00193 // Unicode. 00194 //////////////////////////////////////////////////////////////////// 00195 INLINE void TextEncoder:: 00196 append_unicode_char(int character) { 00197 _wtext = get_wtext() + wstring(1, (wchar_t)character); 00198 _flags = (_flags | F_got_wtext) & ~F_got_text; 00199 } 00200 00201 //////////////////////////////////////////////////////////////////// 00202 // Function: TextEncoder::get_num_chars 00203 // Access: Published 00204 // Description: Returns the number of characters in the stored text. 00205 // This is a count of wide characters, after the string 00206 // has been decoded according to set_encoding(). 00207 //////////////////////////////////////////////////////////////////// 00208 INLINE int TextEncoder:: 00209 get_num_chars() const { 00210 return get_wtext().length(); 00211 } 00212 00213 //////////////////////////////////////////////////////////////////// 00214 // Function: TextEncoder::get_unicode_char 00215 // Access: Published 00216 // Description: Returns the Unicode value of the nth character in the 00217 // stored text. This may be a wide character (greater 00218 // than 255), after the string has been decoded 00219 // according to set_encoding(). 00220 //////////////////////////////////////////////////////////////////// 00221 INLINE int TextEncoder:: 00222 get_unicode_char(int index) const { 00223 get_wtext(); 00224 nassertr(index >= 0 && index < (int)_wtext.length(), 0); 00225 return _wtext[index]; 00226 } 00227 00228 //////////////////////////////////////////////////////////////////// 00229 // Function: TextEncoder::set_unicode_char 00230 // Access: Published 00231 // Description: Sets the Unicode value of the nth character in the 00232 // stored text. This may be a wide character (greater 00233 // than 255), after the string has been decoded 00234 // according to set_encoding(). 00235 //////////////////////////////////////////////////////////////////// 00236 INLINE void TextEncoder:: 00237 set_unicode_char(int index, int character) { 00238 get_wtext(); 00239 nassertv(index >= 0 && index < (int)_wtext.length()); 00240 _wtext[index] = character; 00241 _flags &= ~F_got_text; 00242 } 00243 00244 //////////////////////////////////////////////////////////////////// 00245 // Function: TextEncoder::get_encoded_char 00246 // Access: Published 00247 // Description: Returns the nth char of the stored text, as a one-, 00248 // two-, or three-byte encoded string. 00249 //////////////////////////////////////////////////////////////////// 00250 INLINE string TextEncoder:: 00251 get_encoded_char(int index) const { 00252 return get_encoded_char(index, get_encoding()); 00253 } 00254 00255 //////////////////////////////////////////////////////////////////// 00256 // Function: TextEncoder::get_encoded_char 00257 // Access: Published 00258 // Description: Returns the nth char of the stored text, as a one-, 00259 // two-, or three-byte encoded string. 00260 //////////////////////////////////////////////////////////////////// 00261 INLINE string TextEncoder:: 00262 get_encoded_char(int index, TextEncoder::Encoding encoding) const { 00263 wstring wch(1, (wchar_t)get_unicode_char(index)); 00264 return encode_wtext(wch, encoding); 00265 } 00266 00267 //////////////////////////////////////////////////////////////////// 00268 // Function: TextEncoder::get_text_as_ascii 00269 // Access: Published 00270 // Description: Returns the text associated with the node, converted 00271 // as nearly as possible to a fully-ASCII 00272 // representation. This means replacing accented 00273 // letters with their unaccented ASCII equivalents. 00274 // 00275 // It is possible that some characters in the string 00276 // cannot be converted to ASCII. (The string may 00277 // involve symbols like the copyright symbol, for 00278 // instance, or it might involve letters in some other 00279 // alphabet such as Greek or Cyrillic, or even Latin 00280 // letters like thorn or eth that are not part of the 00281 // ASCII character set.) In this case, as much of the 00282 // string as possible will be converted to ASCII, and 00283 // the nonconvertible characters will remain encoded in 00284 // the encoding specified by set_encoding(). 00285 //////////////////////////////////////////////////////////////////// 00286 INLINE string TextEncoder:: 00287 get_text_as_ascii() const { 00288 return encode_wtext(get_wtext_as_ascii()); 00289 } 00290 00291 //////////////////////////////////////////////////////////////////// 00292 // Function: TextEncoder::reencode_text 00293 // Access: Published, Static 00294 // Description: Given the indicated text string, which is assumed to 00295 // be encoded via the encoding "from", decodes it and 00296 // then reencodes it into the encoding "to", and returns 00297 // the newly encoded string. This does not change or 00298 // affect any properties on the TextEncoder itself. 00299 //////////////////////////////////////////////////////////////////// 00300 INLINE string TextEncoder:: 00301 reencode_text(const string &text, TextEncoder::Encoding from, 00302 TextEncoder::Encoding to) { 00303 return encode_wtext(decode_text(text, from), to); 00304 } 00305 00306 //////////////////////////////////////////////////////////////////// 00307 // Function: TextEncoder::unicode_isalpha 00308 // Access: Published, Static 00309 // Description: Returns true if the indicated character is an 00310 // alphabetic letter, false otherwise. This is akin to 00311 // ctype's isalpha(), extended to Unicode. 00312 //////////////////////////////////////////////////////////////////// 00313 INLINE bool TextEncoder:: 00314 unicode_isalpha(int character) { 00315 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00316 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00317 return false; 00318 } 00319 return entry->_char_type == UnicodeLatinMap::CT_upper || 00320 entry->_char_type == UnicodeLatinMap::CT_lower; 00321 } 00322 00323 //////////////////////////////////////////////////////////////////// 00324 // Function: TextEncoder::unicode_isdigit 00325 // Access: Published, Static 00326 // Description: Returns true if the indicated character is a 00327 // numeric digit, false otherwise. This is akin to 00328 // ctype's isdigit(), extended to Unicode. 00329 //////////////////////////////////////////////////////////////////// 00330 INLINE bool TextEncoder:: 00331 unicode_isdigit(int character) { 00332 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00333 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00334 // The digits aren't actually listed in the map. 00335 return (character >= '0' && character <= '9'); 00336 } 00337 // This silly test (!= 0) is necessary to prevent a VC++ warning. 00338 return (isdigit(entry->_ascii_equiv) != 0); 00339 } 00340 00341 //////////////////////////////////////////////////////////////////// 00342 // Function: TextEncoder::unicode_ispunct 00343 // Access: Published, Static 00344 // Description: Returns true if the indicated character is a 00345 // punctuation mark, false otherwise. This is akin to 00346 // ctype's ispunct(), extended to Unicode. 00347 //////////////////////////////////////////////////////////////////// 00348 INLINE bool TextEncoder:: 00349 unicode_ispunct(int character) { 00350 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00351 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00352 // Some punctuation marks aren't listed in the map. 00353 return (character >= 0 && character < 128 && ispunct(character)); 00354 } 00355 return entry->_char_type == UnicodeLatinMap::CT_punct; 00356 } 00357 00358 //////////////////////////////////////////////////////////////////// 00359 // Function: TextEncoder::unicode_isupper 00360 // Access: Published, Static 00361 // Description: Returns true if the indicated character is an 00362 // uppercase letter, false otherwise. This is akin to 00363 // ctype's isupper(), extended to Unicode. 00364 //////////////////////////////////////////////////////////////////// 00365 INLINE bool TextEncoder:: 00366 unicode_isupper(int character) { 00367 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00368 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00369 return false; 00370 } 00371 return entry->_char_type == UnicodeLatinMap::CT_upper; 00372 } 00373 00374 //////////////////////////////////////////////////////////////////// 00375 // Function: TextEncoder::unicode_islower 00376 // Access: Published, Static 00377 // Description: Returns true if the indicated character is a 00378 // lowercase letter, false otherwise. This is akin to 00379 // ctype's islower(), extended to Unicode. 00380 //////////////////////////////////////////////////////////////////// 00381 INLINE bool TextEncoder:: 00382 unicode_islower(int character) { 00383 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00384 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00385 return false; 00386 } 00387 return entry->_char_type == UnicodeLatinMap::CT_lower; 00388 } 00389 00390 //////////////////////////////////////////////////////////////////// 00391 // Function: TextEncoder::unicode_toupper 00392 // Access: Published, Static 00393 // Description: Returns the uppercase equivalent of the given Unicode 00394 // character. This is akin to ctype's toupper(), 00395 // extended to Unicode. 00396 //////////////////////////////////////////////////////////////////// 00397 INLINE int TextEncoder:: 00398 unicode_toupper(int character) { 00399 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00400 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00401 return character; 00402 } 00403 return entry->_toupper_character; 00404 } 00405 00406 //////////////////////////////////////////////////////////////////// 00407 // Function: TextEncoder::unicode_tolower 00408 // Access: Published, Static 00409 // Description: Returns the uppercase equivalent of the given Unicode 00410 // character. This is akin to ctype's tolower(), 00411 // extended to Unicode. 00412 //////////////////////////////////////////////////////////////////// 00413 INLINE int TextEncoder:: 00414 unicode_tolower(int character) { 00415 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00416 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00417 return character; 00418 } 00419 return entry->_tolower_character; 00420 } 00421 00422 //////////////////////////////////////////////////////////////////// 00423 // Function: TextEncoder::upper 00424 // Access: Published, Static 00425 // Description: Converts the string to uppercase, assuming the string 00426 // is encoded in the default encoding. 00427 //////////////////////////////////////////////////////////////////// 00428 INLINE string TextEncoder:: 00429 upper(const string &source) { 00430 return upper(source, get_default_encoding()); 00431 } 00432 00433 //////////////////////////////////////////////////////////////////// 00434 // Function: TextEncoder::upper 00435 // Access: Published, Static 00436 // Description: Converts the string to uppercase, assuming the string 00437 // is encoded in the indicated encoding. 00438 //////////////////////////////////////////////////////////////////// 00439 INLINE string TextEncoder:: 00440 upper(const string &source, TextEncoder::Encoding encoding) { 00441 TextEncoder encoder; 00442 encoder.set_encoding(encoding); 00443 encoder.set_text(source); 00444 encoder.make_upper(); 00445 return encoder.get_text(); 00446 } 00447 00448 //////////////////////////////////////////////////////////////////// 00449 // Function: TextEncoder::lower 00450 // Access: Published, Static 00451 // Description: Converts the string to lowercase, assuming the string 00452 // is encoded in the default encoding. 00453 //////////////////////////////////////////////////////////////////// 00454 INLINE string TextEncoder:: 00455 lower(const string &source) { 00456 return lower(source, get_default_encoding()); 00457 } 00458 00459 //////////////////////////////////////////////////////////////////// 00460 // Function: TextEncoder::lower 00461 // Access: Published, Static 00462 // Description: Converts the string to lowercase, assuming the string 00463 // is encoded in the indicated encoding. 00464 //////////////////////////////////////////////////////////////////// 00465 INLINE string TextEncoder:: 00466 lower(const string &source, TextEncoder::Encoding encoding) { 00467 TextEncoder encoder; 00468 encoder.set_encoding(encoding); 00469 encoder.set_text(source); 00470 encoder.make_lower(); 00471 return encoder.get_text(); 00472 } 00473 00474 //////////////////////////////////////////////////////////////////// 00475 // Function: TextEncoder::set_wtext 00476 // Access: Public 00477 // Description: Changes the text that is stored in the encoder. 00478 // Subsequent calls to get_wtext() will return this same 00479 // string, while get_text() will return the encoded 00480 // version of the string. 00481 //////////////////////////////////////////////////////////////////// 00482 INLINE void TextEncoder:: 00483 set_wtext(const wstring &wtext) { 00484 if (!has_text() || _wtext != wtext) { 00485 _wtext = wtext; 00486 _flags = (_flags | F_got_wtext) & ~F_got_text; 00487 } 00488 } 00489 00490 //////////////////////////////////////////////////////////////////// 00491 // Function: TextEncoder::get_wtext 00492 // Access: Public 00493 // Description: Returns the text associated with the TextEncoder, as 00494 // a wide-character string. 00495 //////////////////////////////////////////////////////////////////// 00496 INLINE const wstring &TextEncoder:: 00497 get_wtext() const { 00498 if ((_flags & F_got_wtext) == 0) { 00499 ((TextEncoder *)this)->_wtext = decode_text(_text); 00500 ((TextEncoder *)this)->_flags |= F_got_wtext; 00501 } 00502 return _wtext; 00503 } 00504 00505 //////////////////////////////////////////////////////////////////// 00506 // Function: TextEncoder::append_wtext 00507 // Access: Public 00508 // Description: Appends the indicates string to the end of the stored 00509 // wide-character text. 00510 //////////////////////////////////////////////////////////////////// 00511 INLINE void TextEncoder:: 00512 append_wtext(const wstring &wtext) { 00513 _wtext = get_wtext() + wtext; 00514 _flags = (_flags | F_got_wtext) & ~F_got_text; 00515 } 00516 00517 //////////////////////////////////////////////////////////////////// 00518 // Function: TextEncoder::encode_wtext 00519 // Access: Public 00520 // Description: Encodes a wide-text string into a single-char string, 00521 // according to the current encoding. 00522 //////////////////////////////////////////////////////////////////// 00523 INLINE string TextEncoder:: 00524 encode_wtext(const wstring &wtext) const { 00525 return encode_wtext(wtext, _encoding); 00526 } 00527 00528 //////////////////////////////////////////////////////////////////// 00529 // Function: TextEncoder::decode_text 00530 // Access: Public 00531 // Description: Returns the given wstring decoded to a single-byte 00532 // string, via the current encoding system. 00533 //////////////////////////////////////////////////////////////////// 00534 INLINE wstring TextEncoder:: 00535 decode_text(const string &text) const { 00536 return decode_text(text, _encoding); 00537 }