panda/src/express/textEncoder.cxx Source File

00001 // Filename: textEncoder.cxx
00002 // Created by:  drose (26Mar03)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) 2001, Disney Enterprises, Inc.  All rights reserved
00008 //
00009 // All use of this software is subject to the terms of the Panda 3d
00010 // Software license.  You should have received a copy of this license
00011 // along with this source code; you will also find a current copy of
00012 // the license at http://www.panda3d.org/license.txt .
00013 //
00014 // To contact the maintainers of this program write to
00015 // panda3d@yahoogroups.com .
00016 //
00017 ////////////////////////////////////////////////////////////////////
00018 
00019 #include "textEncoder.h"
00020 #include "stringDecoder.h"
00021 #include "unicodeLatinMap.h"
00022 
// Definition of the static TypeHandle member of TextEncoder.
// NOTE(review): presumably this is the class's slot in Panda's
// run-time type system -- confirm against textEncoder.h.
00023 TypeHandle TextEncoder::_type_handle;
// Definition of the class-wide default Encoding.  No initializer is
// given here, so as an object with static storage duration it is
// zero-initialized; which enumerator that value corresponds to depends
// on the declaration of Encoding in textEncoder.h (not visible here).
00024 TextEncoder::Encoding TextEncoder::_default_encoding;
00025 
00026 ////////////////////////////////////////////////////////////////////
00027 //     Function: TextEncoder::make_upper
00028 //       Access: Published
00029 //  Description: Adjusts the text stored within the encoder to all
00030 //               uppercase letters (preserving accent marks
00031 //               correctly).
00032 ////////////////////////////////////////////////////////////////////
00033 void TextEncoder::
00034 make_upper() {
00035   get_wtext();
00036   wstring::iterator si;
00037   for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00038     (*si) = unicode_toupper(*si);
00039   }
00040   _flags &= ~F_got_text;
00041 }
00042 
00043 ////////////////////////////////////////////////////////////////////
00044 //     Function: TextEncoder::make_lower
00045 //       Access: Published
00046 //  Description: Adjusts the text stored within the encoder to all
00047 //               lowercase letters (preserving accent marks
00048 //               correctly).
00049 ////////////////////////////////////////////////////////////////////
00050 void TextEncoder::
00051 make_lower() {
00052   get_wtext();
00053   wstring::iterator si;
00054   for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00055     (*si) = unicode_tolower(*si);
00056   }
00057   _flags &= ~F_got_text;
00058 }
00059 
00060 ////////////////////////////////////////////////////////////////////
00061 //     Function: TextEncoder::get_wtext_as_ascii
00062 //       Access: Public
00063 //  Description: Returns the text associated with the node, converted
00064 //               as nearly as possible to a fully-ASCII
00065 //               representation.  This means replacing accented
00066 //               letters with their unaccented ASCII equivalents.
00067 //
00068 //               It is possible that some characters in the string
00069 //               cannot be converted to ASCII.  (The string may
00070 //               involve symbols like the copyright symbol, for
00071 //               instance, or it might involve letters in some other
00072 //               alphabet such as Greek or Cyrillic, or even Latin
00073 //               letters like thorn or eth that are not part of the
00074 //               ASCII character set.)  In this case, as much of the
00075 //               string as possible will be converted to ASCII, and
00076 //               the nonconvertible characters will remain in their
00077 //               original form.
00078 ////////////////////////////////////////////////////////////////////
00079 wstring TextEncoder::
00080 get_wtext_as_ascii() const {
00081   get_wtext();
00082   wstring result;
00083   wstring::const_iterator si;
00084   for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00085     wchar_t character = (*si);
00086 
00087     const UnicodeLatinMap::Entry *map_entry = 
00088       UnicodeLatinMap::look_up(character);
00089     if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00090       result += (wchar_t)map_entry->_ascii_equiv;
00091       if (map_entry->_ascii_additional != 0) {
00092         result += (wchar_t)map_entry->_ascii_additional;
00093       }
00094 
00095     } else {
00096       result += character;
00097     }
00098   }
00099 
00100   return result;
00101 }
00102 
00103 ////////////////////////////////////////////////////////////////////
00104 //     Function: TextEncoder::encode_wchar
00105 //       Access: Public, Static
00106 //  Description: Encodes a single wide char into a one-, two-, or
00107 //               three-byte string, according to the given encoding
00108 //               system.
00109 ////////////////////////////////////////////////////////////////////
00110 string TextEncoder::
00111 encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
00112   switch (encoding) {
00113   case E_iso8859:
00114     if (ch < 0x100) {
00115       return string(1, (char)ch);
00116     } else {
00117       // The character won't fit in the 8-bit ISO 8859.  See if we can
00118       // make it fit by reducing it to its ascii equivalent
00119       // (essentially stripping off an unusual accent mark).
00120       const UnicodeLatinMap::Entry *map_entry = 
00121         UnicodeLatinMap::look_up(ch);
00122       if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00123         // Yes, it has an ascii equivalent.
00124         if (map_entry->_ascii_additional != 0) {
00125           // In fact, it has two of them.
00126           return
00127             string(1, map_entry->_ascii_equiv) +
00128             string(1, map_entry->_ascii_additional);
00129         }
00130         return string(1, map_entry->_ascii_equiv);
00131       }
00132       // Nope; return "." for lack of anything better.
00133       return ".";
00134     }
00135 
00136   case E_utf8:
00137     if (ch < 0x80) {
00138       return string(1, (char)ch);
00139     } else if (ch < 0x800) {
00140       return 
00141         string(1, (char)((ch >> 6) | 0xc0)) +
00142         string(1, (char)((ch & 0x3f) | 0x80));
00143     } else {
00144       return 
00145         string(1, (char)((ch >> 12) | 0xe0)) +
00146         string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
00147         string(1, (char)((ch & 0x3f) | 0x80));
00148     }
00149 
00150   case E_unicode:
00151     return
00152       string(1, (char)(ch >> 8)) + 
00153       string(1, (char)(ch & 0xff));
00154   }
00155 
00156   return "";
00157 }
00158 
00159 ////////////////////////////////////////////////////////////////////
00160 //     Function: TextEncoder::encode_wtext
00161 //       Access: Public, Static
00162 //  Description: Encodes a wide-text string into a single-char string,
00163 //               according to the given encoding.
00164 ////////////////////////////////////////////////////////////////////
00165 string TextEncoder::
00166 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
00167   string result;
00168 
00169   for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
00170     result += encode_wchar(*pi, encoding);
00171   }
00172 
00173   return result;
00174 }
00175 
00176 ////////////////////////////////////////////////////////////////////
00177 //     Function: TextEncoder::decode_text
00178 //       Access: Public, Static
00179 //  Description: Returns the given wstring decoded to a single-byte
00180 //               string, via the given encoding system.
00181 ////////////////////////////////////////////////////////////////////
00182 wstring TextEncoder::
00183 decode_text(const string &text, TextEncoder::Encoding encoding) {
00184   switch (encoding) {
00185   case E_utf8:
00186     {
00187       StringUtf8Decoder decoder(text);
00188       return decode_text_impl(decoder);
00189     }
00190 
00191   case E_unicode:
00192     {
00193       StringUnicodeDecoder decoder(text);
00194       return decode_text_impl(decoder);
00195     }
00196 
00197   case E_iso8859:
00198   default:
00199     {
00200       StringDecoder decoder(text);
00201       return decode_text_impl(decoder);
00202     }
00203   };
00204 }
00205 
00206 ////////////////////////////////////////////////////////////////////
00207 //     Function: TextEncoder::decode_text_impl
00208 //       Access: Private, Static
00209 //  Description: Decodes the eight-bit stream from the indicated
00210 //               decoder, returning the decoded wide-char string.
00211 ////////////////////////////////////////////////////////////////////
00212 wstring TextEncoder::
00213 decode_text_impl(StringDecoder &decoder) {
00214   wstring result;
00215   //  bool expand_amp = get_expand_amp();
00216 
00217   wchar_t character = decoder.get_next_character();
00218   while (!decoder.is_eof()) {
00219     /*
00220     if (character == '&' && expand_amp) {
00221       // An ampersand in expand_amp mode is treated as an escape
00222       // character.
00223       character = expand_amp_sequence(decoder);
00224     }
00225     */
00226     result += character;
00227     character = decoder.get_next_character();
00228   }
00229 
00230   return result;
00231 }
00232 
00233 /*
00234 ////////////////////////////////////////////////////////////////////
00235 //     Function: TextEncoder::expand_amp_sequence
00236 //       Access: Private
00237 //  Description: Given that we have just read an ampersand from the
00238 //               StringDecoder, and that we have expand_amp in effect
00239 //               and are therefore expected to expand the sequence
00240 //               that this ampersand begins into a single unicode
00241 //               character, do the expansion and return the character.
00242 ////////////////////////////////////////////////////////////////////
00243 int TextEncoder::
00244 expand_amp_sequence(StringDecoder &decoder) const {
00245   int result = 0;
00246 
00247   int character = decoder.get_next_character();
00248   if (!decoder.is_eof() && character == '#') {
00249     // An explicit numeric sequence: &#nnn;
00250     result = 0;
00251     character = decoder.get_next_character();
00252     while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
00253       result = (result * 10) + (character - '0');
00254       character = decoder.get_next_character();
00255     }
00256     if (character != ';') {
00257       // Invalid sequence.
00258       return 0;
00259     }
00260 
00261     return result;
00262   }
00263 
00264   string sequence;
00265   
00266   // Some non-numeric sequence.
00267   while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
00268     sequence += character;
00269     character = decoder.get_next_character();
00270   }
00271   if (character != ';') {
00272     // Invalid sequence.
00273     return 0;
00274   }
00275 
00276   static const struct {
00277     const char *name;
00278     int code;
00279   } tokens[] = {
00280     { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
00281     { "nbsp", ' ' },
00282 
00283     { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
00284     { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
00285     { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
00286     { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
00287     { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
00288     { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
00289     { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
00290     { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
00291     { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
00292     { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
00293     { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
00294     { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
00295     { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
00296     { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
00297     { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
00298     { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
00299     { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
00300     { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
00301     { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
00302     { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
00303     { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
00304     { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
00305     { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
00306     { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
00307     { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
00308 
00309     { NULL, 0 },
00310   };
00311 
00312   for (int i = 0; tokens[i].name != NULL; i++) {
00313     if (sequence == tokens[i].name) {
00314       // Here's a match.
00315       return tokens[i].code;
00316     }
00317   }
00318 
00319   // Some unrecognized sequence.
00320   return 0;
00321 }
00322 */
00323