00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include "textEncoder.h"
00020 #include "stringDecoder.h"
00021 #include "unicodeLatinMap.h"
00022
// Storage for TextEncoder's static data members.  Neither definition
// supplies an explicit initializer in this file.
00023 TypeHandle TextEncoder::_type_handle;
// NOTE(review): assuming Encoding is an enum, this static is
// zero-initialized; which enumerator has the value 0 is not visible
// here -- confirm that it is the intended default encoding.
00024 TextEncoder::Encoding TextEncoder::_default_encoding;
00025
00026
00027
00028
00029
00030
00031
00032
00033 void TextEncoder::
00034 make_upper() {
00035 get_wtext();
00036 wstring::iterator si;
00037 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00038 (*si) = unicode_toupper(*si);
00039 }
00040 _flags &= ~F_got_text;
00041 }
00042
00043
00044
00045
00046
00047
00048
00049
00050 void TextEncoder::
00051 make_lower() {
00052 get_wtext();
00053 wstring::iterator si;
00054 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00055 (*si) = unicode_tolower(*si);
00056 }
00057 _flags &= ~F_got_text;
00058 }
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079 wstring TextEncoder::
00080 get_wtext_as_ascii() const {
00081 get_wtext();
00082 wstring result;
00083 wstring::const_iterator si;
00084 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00085 wchar_t character = (*si);
00086
00087 const UnicodeLatinMap::Entry *map_entry =
00088 UnicodeLatinMap::look_up(character);
00089 if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00090 result += (wchar_t)map_entry->_ascii_equiv;
00091 if (map_entry->_ascii_additional != 0) {
00092 result += (wchar_t)map_entry->_ascii_additional;
00093 }
00094
00095 } else {
00096 result += character;
00097 }
00098 }
00099
00100 return result;
00101 }
00102
00103
00104
00105
00106
00107
00108
00109
00110 string TextEncoder::
00111 encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
00112 switch (encoding) {
00113 case E_iso8859:
00114 if (ch < 0x100) {
00115 return string(1, (char)ch);
00116 } else {
00117
00118
00119
00120 const UnicodeLatinMap::Entry *map_entry =
00121 UnicodeLatinMap::look_up(ch);
00122 if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00123
00124 if (map_entry->_ascii_additional != 0) {
00125
00126 return
00127 string(1, map_entry->_ascii_equiv) +
00128 string(1, map_entry->_ascii_additional);
00129 }
00130 return string(1, map_entry->_ascii_equiv);
00131 }
00132
00133 return ".";
00134 }
00135
00136 case E_utf8:
00137 if (ch < 0x80) {
00138 return string(1, (char)ch);
00139 } else if (ch < 0x800) {
00140 return
00141 string(1, (char)((ch >> 6) | 0xc0)) +
00142 string(1, (char)((ch & 0x3f) | 0x80));
00143 } else {
00144 return
00145 string(1, (char)((ch >> 12) | 0xe0)) +
00146 string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
00147 string(1, (char)((ch & 0x3f) | 0x80));
00148 }
00149
00150 case E_unicode:
00151 return
00152 string(1, (char)(ch >> 8)) +
00153 string(1, (char)(ch & 0xff));
00154 }
00155
00156 return "";
00157 }
00158
00159
00160
00161
00162
00163
00164
00165 string TextEncoder::
00166 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
00167 string result;
00168
00169 for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
00170 result += encode_wchar(*pi, encoding);
00171 }
00172
00173 return result;
00174 }
00175
00176
00177
00178
00179
00180
00181
00182 wstring TextEncoder::
00183 decode_text(const string &text, TextEncoder::Encoding encoding) {
00184 switch (encoding) {
00185 case E_utf8:
00186 {
00187 StringUtf8Decoder decoder(text);
00188 return decode_text_impl(decoder);
00189 }
00190
00191 case E_unicode:
00192 {
00193 StringUnicodeDecoder decoder(text);
00194 return decode_text_impl(decoder);
00195 }
00196
00197 case E_iso8859:
00198 default:
00199 {
00200 StringDecoder decoder(text);
00201 return decode_text_impl(decoder);
00202 }
00203 };
00204 }
00205
00206
00207
00208
00209
00210
00211
00212 wstring TextEncoder::
00213 decode_text_impl(StringDecoder &decoder) {
00214 wstring result;
00215
00216
00217 wchar_t character = decoder.get_next_character();
00218 while (!decoder.is_eof()) {
00219
00220
00221
00222
00223
00224
00225
00226 result += character;
00227 character = decoder.get_next_character();
00228 }
00229
00230 return result;
00231 }
00232
00233
00234
00235 // Function: TextEncoder::expand_amp_sequence
00236 // Access: Private
00237 // Description: Given that we have just read an ampersand from the
00238 // StringDecoder, and that we have expand_amp in effect
00239 // and are therefore expected to expand the sequence
00240 // that this ampersand begins into a single unicode
00241 // character, do the expansion and return the character.
00242
00243 int TextEncoder::
00244 expand_amp_sequence(StringDecoder &decoder) const {
00245 int result = 0;
00246
00247 int character = decoder.get_next_character();
00248 if (!decoder.is_eof() && character == '#') {
00249 // An explicit numeric sequence: &#nnn;
00250 result = 0;
00251 character = decoder.get_next_character();
00252 while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
00253 result = (result * 10) + (character - '0');
00254 character = decoder.get_next_character();
00255 }
00256 if (character != ';') {
00257 // Invalid sequence.
00258 return 0;
00259 }
00260
00261 return result;
00262 }
00263
00264 string sequence;
00265
00266 // Some non-numeric sequence.
00267 while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
00268 sequence += character;
00269 character = decoder.get_next_character();
00270 }
00271 if (character != ';') {
00272 // Invalid sequence.
00273 return 0;
00274 }
00275
00276 static const struct {
00277 const char *name;
00278 int code;
00279 } tokens[] = {
00280 { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
00281 { "nbsp", ' ' },
00282
00283 { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
00284 { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
00285 { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
00286 { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
00287 { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
00288 { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
00289 { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
00290 { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
00291 { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
00292 { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
00293 { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
00294 { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
00295 { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
00296 { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
00297 { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
00298 { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
00299 { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
00300 { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
00301 { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
00302 { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
00303 { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
00304 { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
00305 { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
00306 { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
00307 { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
00308
00309 { NULL, 0 },
00310 };
00311
00312 for (int i = 0; tokens[i].name != NULL; i++) {
00313 if (sequence == tokens[i].name) {
00314 // Here's a match.
00315 return tokens[i].code;
00316 }
00317 }
00318
00319 // Some unrecognized sequence.
00320 return 0;
00321 }
00322 */
00323