panda/src/express/textEncoder.I Source File

00001 // Filename: textEncoder.I
00002 // Created by:  drose (26Mar03)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) 2001, Disney Enterprises, Inc.  All rights reserved
00008 //
00009 // All use of this software is subject to the terms of the Panda 3d
00010 // Software license.  You should have received a copy of this license
00011 // along with this source code; you will also find a current copy of
00012 // the license at http://www.panda3d.org/license.txt .
00013 //
00014 // To contact the maintainers of this program write to
00015 // panda3d@yahoogroups.com .
00016 //
00017 ////////////////////////////////////////////////////////////////////
00018 
00019 
00020 ////////////////////////////////////////////////////////////////////
00021 //     Function: TextEncoder::Constructor
00022 //       Access: Published
00023 //  Description:
00024 ////////////////////////////////////////////////////////////////////
00025 INLINE TextEncoder::
00026 TextEncoder() {
00027   _encoding = _default_encoding;
00028   
00029   // Initially, since the text string is empty, we know that both
00030   // _text and _wtext accurately reflect the empty state; so we "got"
00031   // both of them.
00032   _flags = (F_got_text | F_got_wtext);
00033 }
00034 
00035 ////////////////////////////////////////////////////////////////////
00036 //     Function: TextEncoder::set_encoding
00037 //       Access: Published
00038 //  Description: Specifies how the string set via set_text() is to be
00039 //               interpreted.  The default, E_iso8859, means a
00040 //               standard string with one-byte characters
00041 //               (i.e. ASCII).  Other encodings are possible to take
00042 //               advantage of character sets with more than 256
00043 //               characters.
00044 //
00045 //               This affects only future calls to set_text(); it does
00046 //               not change text that was set previously.
00047 ////////////////////////////////////////////////////////////////////
00048 INLINE void TextEncoder::
00049 set_encoding(TextEncoder::Encoding encoding) {
00050   // Force the previously-set strings to be encoded or decoded now.
00051   get_text();
00052   get_wtext();
00053   _encoding = encoding;
00054 }
00055 
00056 ////////////////////////////////////////////////////////////////////
00057 //     Function: TextEncoder::get_encoding
00058 //       Access: Published
00059 //  Description: Returns the encoding by which the string set via
00060 //               set_text() is to be interpreted.  See set_encoding().
00061 ////////////////////////////////////////////////////////////////////
00062 INLINE TextEncoder::Encoding TextEncoder::
00063 get_encoding() const {
00064   return _encoding;
00065 }
00066 
00067 ////////////////////////////////////////////////////////////////////
00068 //     Function: TextEncoder::set_default_encoding
00069 //       Access: Published, Static
00070 //  Description: Specifies the default encoding to be used for all
00071 //               subsequently created TextEncoder objects.  See
00072 //               set_encoding().
00073 ////////////////////////////////////////////////////////////////////
00074 INLINE void TextEncoder::
00075 set_default_encoding(TextEncoder::Encoding encoding) {
00076   _default_encoding = encoding;
00077 }
00078 
00079 ////////////////////////////////////////////////////////////////////
00080 //     Function: TextEncoder::get_default_encoding
00081 //       Access: Published, Static
00082 //  Description: Specifies the default encoding to be used for all
00083 //               subsequently created TextEncoder objects.  See
00084 //               set_encoding().
00085 ////////////////////////////////////////////////////////////////////
00086 INLINE TextEncoder::Encoding TextEncoder::
00087 get_default_encoding() {
00088   return _default_encoding;
00089 }
00090 
00091 ////////////////////////////////////////////////////////////////////
00092 //     Function: TextEncoder::set_text
00093 //       Access: Published
00094 //  Description: Changes the text that is stored in the encoder.  The
00095 //               text should be encoded according to the method
00096 //               indicated by set_encoding().  Subsequent calls to
00097 //               get_text() will return this same string, while
00098 //               get_wtext() will return the decoded version of the
00099 //               string.
00100 ////////////////////////////////////////////////////////////////////
00101 INLINE void TextEncoder::
00102 set_text(const string &text) {
00103   if (!has_text() || _text != text) {
00104     _text = text;
00105     _flags = (_flags | F_got_text) & ~F_got_wtext;
00106   }
00107 }
00108 
00109 ////////////////////////////////////////////////////////////////////
00110 //     Function: TextEncoder::set_text
00111 //       Access: Published
00112 //  Description: The two-parameter version of set_text() accepts an
00113 //               explicit encoding; the text is immediately decoded
00114 //               and stored as a wide-character string.  Subsequent
00115 //               calls to get_text() will return the same text
00116 //               re-encoded using whichever encoding is specified by
00117 //               set_encoding().
00118 ////////////////////////////////////////////////////////////////////
00119 INLINE void TextEncoder::
00120 set_text(const string &text, TextEncoder::Encoding encoding) {
00121   set_wtext(decode_text(text, encoding));
00122 }
00123 
00124 ////////////////////////////////////////////////////////////////////
00125 //     Function: TextEncoder::clear_text
00126 //       Access: Published
00127 //  Description: Removes the text from the TextEncoder.
00128 ////////////////////////////////////////////////////////////////////
00129 INLINE void TextEncoder::
00130 clear_text() {
00131   _text = string();
00132   _wtext = wstring();
00133   _flags |= (F_got_text | F_got_wtext);
00134 }
00135 
00136 ////////////////////////////////////////////////////////////////////
00137 //     Function: TextEncoder::has_text
00138 //       Access: Published
00139 //  Description:
00140 ////////////////////////////////////////////////////////////////////
00141 INLINE bool TextEncoder::
00142 has_text() const {
00143   if (_flags & F_got_wtext) {
00144     return !_wtext.empty();
00145   } else {
00146     return !_text.empty();
00147   }
00148 }
00149 
00150 ////////////////////////////////////////////////////////////////////
00151 //     Function: TextEncoder::get_text
00152 //       Access: Published
00153 //  Description: Returns the current text, as encoded via the current
00154 //               encoding system.
00155 ////////////////////////////////////////////////////////////////////
00156 INLINE string TextEncoder::
00157 get_text() const {
00158   if ((_flags & F_got_text) == 0) {
00159     ((TextEncoder *)this)->_text = encode_wtext(_wtext);
00160     ((TextEncoder *)this)->_flags |= F_got_text;
00161   }
00162   return _text;
00163 }
00164 
00165 ////////////////////////////////////////////////////////////////////
00166 //     Function: TextEncoder::get_text
00167 //       Access: Published
00168 //  Description: Returns the current text, as encoded via the indicated
00169 //               encoding system.
00170 ////////////////////////////////////////////////////////////////////
00171 INLINE string TextEncoder::
00172 get_text(TextEncoder::Encoding encoding) const {
00173   return encode_wtext(get_wtext(), encoding);
00174 }
00175 
00176 ////////////////////////////////////////////////////////////////////
00177 //     Function: TextEncoder::append_text
00178 //       Access: Published
00179 //  Description: Appends the indicates string to the end of the stored
00180 //               text.
00181 ////////////////////////////////////////////////////////////////////
00182 INLINE void TextEncoder::
00183 append_text(const string &text) {
00184   _text = get_text() + text;
00185   _flags = (_flags | F_got_text) & ~F_got_wtext;
00186 }
00187 
00188 ////////////////////////////////////////////////////////////////////
00189 //     Function: TextEncoder::append_unicode_char
00190 //       Access: Published
00191 //  Description: Appends a single character to the end of the stored
00192 //               text.  This may be a wide character, up to 16 bits in
00193 //               Unicode.
00194 ////////////////////////////////////////////////////////////////////
00195 INLINE void TextEncoder::
00196 append_unicode_char(int character) {
00197   _wtext = get_wtext() + wstring(1, (wchar_t)character);
00198   _flags = (_flags | F_got_wtext) & ~F_got_text;
00199 }
00200 
00201 ////////////////////////////////////////////////////////////////////
00202 //     Function: TextEncoder::get_num_chars
00203 //       Access: Published
00204 //  Description: Returns the number of characters in the stored text.
00205 //               This is a count of wide characters, after the string
00206 //               has been decoded according to set_encoding().
00207 ////////////////////////////////////////////////////////////////////
00208 INLINE int TextEncoder::
00209 get_num_chars() const {
00210   return get_wtext().length();
00211 }
00212 
00213 ////////////////////////////////////////////////////////////////////
00214 //     Function: TextEncoder::get_unicode_char
00215 //       Access: Published
00216 //  Description: Returns the Unicode value of the nth character in the
00217 //               stored text.  This may be a wide character (greater
00218 //               than 255), after the string has been decoded
00219 //               according to set_encoding().
00220 ////////////////////////////////////////////////////////////////////
00221 INLINE int TextEncoder::
00222 get_unicode_char(int index) const {
00223   get_wtext();
00224   nassertr(index >= 0 && index < (int)_wtext.length(), 0);
00225   return _wtext[index];
00226 }
00227 
00228 ////////////////////////////////////////////////////////////////////
00229 //     Function: TextEncoder::set_unicode_char
00230 //       Access: Published
00231 //  Description: Sets the Unicode value of the nth character in the
00232 //               stored text.  This may be a wide character (greater
00233 //               than 255), after the string has been decoded
00234 //               according to set_encoding().
00235 ////////////////////////////////////////////////////////////////////
00236 INLINE void TextEncoder::
00237 set_unicode_char(int index, int character) {
00238   get_wtext();
00239   nassertv(index >= 0 && index < (int)_wtext.length());
00240   _wtext[index] = character;
00241   _flags &= ~F_got_text;
00242 }
00243 
00244 ////////////////////////////////////////////////////////////////////
00245 //     Function: TextEncoder::get_encoded_char
00246 //       Access: Published
00247 //  Description: Returns the nth char of the stored text, as a one-,
00248 //               two-, or three-byte encoded string.
00249 ////////////////////////////////////////////////////////////////////
00250 INLINE string TextEncoder::
00251 get_encoded_char(int index) const {
00252   return get_encoded_char(index, get_encoding());
00253 }
00254 
00255 ////////////////////////////////////////////////////////////////////
00256 //     Function: TextEncoder::get_encoded_char
00257 //       Access: Published
00258 //  Description: Returns the nth char of the stored text, as a one-,
00259 //               two-, or three-byte encoded string.
00260 ////////////////////////////////////////////////////////////////////
00261 INLINE string TextEncoder::
00262 get_encoded_char(int index, TextEncoder::Encoding encoding) const {
00263   wstring wch(1, (wchar_t)get_unicode_char(index));
00264   return encode_wtext(wch, encoding);
00265 }
00266 
00267 ////////////////////////////////////////////////////////////////////
00268 //     Function: TextEncoder::get_text_as_ascii
00269 //       Access: Published
00270 //  Description: Returns the text associated with the node, converted
00271 //               as nearly as possible to a fully-ASCII
00272 //               representation.  This means replacing accented
00273 //               letters with their unaccented ASCII equivalents.
00274 //
00275 //               It is possible that some characters in the string
00276 //               cannot be converted to ASCII.  (The string may
00277 //               involve symbols like the copyright symbol, for
00278 //               instance, or it might involve letters in some other
00279 //               alphabet such as Greek or Cyrillic, or even Latin
00280 //               letters like thorn or eth that are not part of the
00281 //               ASCII character set.)  In this case, as much of the
00282 //               string as possible will be converted to ASCII, and
00283 //               the nonconvertible characters will remain encoded in
00284 //               the encoding specified by set_encoding().
00285 ////////////////////////////////////////////////////////////////////
00286 INLINE string TextEncoder::
00287 get_text_as_ascii() const {
00288   return encode_wtext(get_wtext_as_ascii());
00289 }
00290 
00291 ////////////////////////////////////////////////////////////////////
00292 //     Function: TextEncoder::reencode_text
00293 //       Access: Published, Static
00294 //  Description: Given the indicated text string, which is assumed to
00295 //               be encoded via the encoding "from", decodes it and
00296 //               then reencodes it into the encoding "to", and returns
00297 //               the newly encoded string.  This does not change or
00298 //               affect any properties on the TextEncoder itself.
00299 ////////////////////////////////////////////////////////////////////
00300 INLINE string TextEncoder::
00301 reencode_text(const string &text, TextEncoder::Encoding from, 
00302               TextEncoder::Encoding to) {
00303   return encode_wtext(decode_text(text, from), to);
00304 }
00305 
00306 ////////////////////////////////////////////////////////////////////
00307 //     Function: TextEncoder::unicode_isalpha
00308 //       Access: Published, Static
00309 //  Description: Returns true if the indicated character is an
00310 //               alphabetic letter, false otherwise.  This is akin to
00311 //               ctype's isalpha(), extended to Unicode.
00312 ////////////////////////////////////////////////////////////////////
00313 INLINE bool TextEncoder::
00314 unicode_isalpha(int character) {
00315   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00316   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00317     return false;
00318   }
00319   return entry->_char_type == UnicodeLatinMap::CT_upper ||
00320     entry->_char_type == UnicodeLatinMap::CT_lower;
00321 }
00322 
00323 ////////////////////////////////////////////////////////////////////
00324 //     Function: TextEncoder::unicode_isdigit
00325 //       Access: Published, Static
00326 //  Description: Returns true if the indicated character is a
00327 //               numeric digit, false otherwise.  This is akin to
00328 //               ctype's isdigit(), extended to Unicode.
00329 ////////////////////////////////////////////////////////////////////
00330 INLINE bool TextEncoder::
00331 unicode_isdigit(int character) {
00332   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00333   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00334     // The digits aren't actually listed in the map.
00335     return (character >= '0' && character <= '9');
00336   }
00337   // This silly test (!= 0) is necessary to prevent a VC++ warning.
00338   return (isdigit(entry->_ascii_equiv) != 0);
00339 }
00340 
00341 ////////////////////////////////////////////////////////////////////
00342 //     Function: TextEncoder::unicode_ispunct
00343 //       Access: Published, Static
00344 //  Description: Returns true if the indicated character is a
00345 //               punctuation mark, false otherwise.  This is akin to
00346 //               ctype's ispunct(), extended to Unicode.
00347 ////////////////////////////////////////////////////////////////////
00348 INLINE bool TextEncoder::
00349 unicode_ispunct(int character) {
00350   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00351   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00352     // Some punctuation marks aren't listed in the map.
00353     return (character >= 0 && character < 128 && ispunct(character));
00354   }
00355   return entry->_char_type == UnicodeLatinMap::CT_punct;
00356 }
00357 
00358 ////////////////////////////////////////////////////////////////////
00359 //     Function: TextEncoder::unicode_isupper
00360 //       Access: Published, Static
00361 //  Description: Returns true if the indicated character is an
00362 //               uppercase letter, false otherwise.  This is akin to
00363 //               ctype's isupper(), extended to Unicode.
00364 ////////////////////////////////////////////////////////////////////
00365 INLINE bool TextEncoder::
00366 unicode_isupper(int character) {
00367   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00368   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00369     return false;
00370   }
00371   return entry->_char_type == UnicodeLatinMap::CT_upper;
00372 }
00373 
00374 ////////////////////////////////////////////////////////////////////
00375 //     Function: TextEncoder::unicode_islower
00376 //       Access: Published, Static
00377 //  Description: Returns true if the indicated character is a
00378 //               lowercase letter, false otherwise.  This is akin to
00379 //               ctype's islower(), extended to Unicode.
00380 ////////////////////////////////////////////////////////////////////
00381 INLINE bool TextEncoder::
00382 unicode_islower(int character) {
00383   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00384   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00385     return false;
00386   }
00387   return entry->_char_type == UnicodeLatinMap::CT_lower;
00388 }
00389 
00390 ////////////////////////////////////////////////////////////////////
00391 //     Function: TextEncoder::unicode_toupper
00392 //       Access: Published, Static
00393 //  Description: Returns the uppercase equivalent of the given Unicode
00394 //               character.  This is akin to ctype's toupper(),
00395 //               extended to Unicode.
00396 ////////////////////////////////////////////////////////////////////
00397 INLINE int TextEncoder::
00398 unicode_toupper(int character) {
00399   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00400   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00401     return character;
00402   } 
00403   return entry->_toupper_character;
00404 }
00405 
00406 ////////////////////////////////////////////////////////////////////
00407 //     Function: TextEncoder::unicode_tolower
00408 //       Access: Published, Static
00409 //  Description: Returns the uppercase equivalent of the given Unicode
00410 //               character.  This is akin to ctype's tolower(),
00411 //               extended to Unicode.
00412 ////////////////////////////////////////////////////////////////////
00413 INLINE int TextEncoder::
00414 unicode_tolower(int character) {
00415   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00416   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00417     return character;
00418   } 
00419   return entry->_tolower_character;
00420 }
00421 
00422 ////////////////////////////////////////////////////////////////////
00423 //     Function: TextEncoder::upper
00424 //       Access: Published, Static
00425 //  Description: Converts the string to uppercase, assuming the string
00426 //               is encoded in the default encoding.
00427 ////////////////////////////////////////////////////////////////////
00428 INLINE string TextEncoder::
00429 upper(const string &source) {
00430   return upper(source, get_default_encoding());
00431 }
00432 
00433 ////////////////////////////////////////////////////////////////////
00434 //     Function: TextEncoder::upper
00435 //       Access: Published, Static
00436 //  Description: Converts the string to uppercase, assuming the string
00437 //               is encoded in the indicated encoding.
00438 ////////////////////////////////////////////////////////////////////
00439 INLINE string TextEncoder::
00440 upper(const string &source, TextEncoder::Encoding encoding) {
00441   TextEncoder encoder;
00442   encoder.set_encoding(encoding);
00443   encoder.set_text(source);
00444   encoder.make_upper();
00445   return encoder.get_text();
00446 }
00447 
00448 ////////////////////////////////////////////////////////////////////
00449 //     Function: TextEncoder::lower
00450 //       Access: Published, Static
00451 //  Description: Converts the string to lowercase, assuming the string
00452 //               is encoded in the default encoding.
00453 ////////////////////////////////////////////////////////////////////
00454 INLINE string TextEncoder::
00455 lower(const string &source) {
00456   return lower(source, get_default_encoding());
00457 }
00458 
00459 ////////////////////////////////////////////////////////////////////
00460 //     Function: TextEncoder::lower
00461 //       Access: Published, Static
00462 //  Description: Converts the string to lowercase, assuming the string
00463 //               is encoded in the indicated encoding.
00464 ////////////////////////////////////////////////////////////////////
00465 INLINE string TextEncoder::
00466 lower(const string &source, TextEncoder::Encoding encoding) {
00467   TextEncoder encoder;
00468   encoder.set_encoding(encoding);
00469   encoder.set_text(source);
00470   encoder.make_lower();
00471   return encoder.get_text();
00472 }
00473 
00474 ////////////////////////////////////////////////////////////////////
00475 //     Function: TextEncoder::set_wtext
00476 //       Access: Public
00477 //  Description: Changes the text that is stored in the encoder.
00478 //               Subsequent calls to get_wtext() will return this same
00479 //               string, while get_text() will return the encoded
00480 //               version of the string.
00481 ////////////////////////////////////////////////////////////////////
00482 INLINE void TextEncoder::
00483 set_wtext(const wstring &wtext) {
00484   if (!has_text() || _wtext != wtext) {
00485     _wtext = wtext;
00486     _flags = (_flags | F_got_wtext) & ~F_got_text;
00487   }
00488 }
00489 
00490 ////////////////////////////////////////////////////////////////////
00491 //     Function: TextEncoder::get_wtext
00492 //       Access: Public
00493 //  Description: Returns the text associated with the TextEncoder, as
00494 //               a wide-character string.
00495 ////////////////////////////////////////////////////////////////////
00496 INLINE const wstring &TextEncoder::
00497 get_wtext() const {
00498   if ((_flags & F_got_wtext) == 0) {
00499     ((TextEncoder *)this)->_wtext = decode_text(_text);
00500     ((TextEncoder *)this)->_flags |= F_got_wtext;
00501   }
00502   return _wtext;
00503 }
00504 
00505 ////////////////////////////////////////////////////////////////////
00506 //     Function: TextEncoder::append_wtext
00507 //       Access: Public
00508 //  Description: Appends the indicates string to the end of the stored
00509 //               wide-character text.
00510 ////////////////////////////////////////////////////////////////////
00511 INLINE void TextEncoder::
00512 append_wtext(const wstring &wtext) {
00513   _wtext = get_wtext() + wtext;
00514   _flags = (_flags | F_got_wtext) & ~F_got_text;
00515 }
00516 
00517 ////////////////////////////////////////////////////////////////////
00518 //     Function: TextEncoder::encode_wtext
00519 //       Access: Public
00520 //  Description: Encodes a wide-text string into a single-char string,
00521 //               according to the current encoding.
00522 ////////////////////////////////////////////////////////////////////
00523 INLINE string TextEncoder::
00524 encode_wtext(const wstring &wtext) const {
00525   return encode_wtext(wtext, _encoding);
00526 }
00527 
00528 ////////////////////////////////////////////////////////////////////
00529 //     Function: TextEncoder::decode_text
00530 //       Access: Public
00531 //  Description: Returns the given wstring decoded to a single-byte
00532 //               string, via the current encoding system.
00533 ////////////////////////////////////////////////////////////////////
00534 INLINE wstring TextEncoder::
00535 decode_text(const string &text) const {
00536   return decode_text(text, _encoding);
00537 }