panda/src/express/stringDecoder.cxx Source File

00001 // Filename: stringDecoder.cxx
00002 // Created by:  drose (11Feb02)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) 2001, Disney Enterprises, Inc.  All rights reserved
00008 //
00009 // All use of this software is subject to the terms of the Panda 3d
00010 // Software license.  You should have received a copy of this license
00011 // along with this source code; you will also find a current copy of
00012 // the license at http://www.panda3d.org/license.txt .
00013 //
00014 // To contact the maintainers of this program write to
00015 // panda3d@yahoogroups.com .
00016 //
00017 ////////////////////////////////////////////////////////////////////
00018 
00019 #include "stringDecoder.h"
00020 #include "config_express.h"
00021 
00022 ////////////////////////////////////////////////////////////////////
00023 //     Function: StringDecoder::Destructor
00024 //       Access: Public, Virtual
00025 //  Description: 
00026 ////////////////////////////////////////////////////////////////////
00027 StringDecoder::
00028 ~StringDecoder() {
00029 }
00030 
00031 ////////////////////////////////////////////////////////////////////
00032 //     Function: StringDecoder::get_next_character
00033 //       Access: Public, Virtual
00034 //  Description: Returns the next character in sequence.
00035 ////////////////////////////////////////////////////////////////////
00036 int StringDecoder::
00037 get_next_character() {
00038   if (test_eof()) {
00039     return -1;
00040   }
00041   return (unsigned char)_input[_p++];
00042 }
00043 
00044 /*
00045 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
00046 one, two, or three 8-bit bytes, depending on the value of the
00047 character. The following table shows the format of such UTF-8 byte
00048 sequences (where the "free bits" shown by x's in the table are
00049 combined in the order shown, and interpreted from most significant to
00050 least significant):
00051 
00052  Binary format of bytes in sequence:
00053                                         Number of    Maximum expressible
00054  1st byte     2nd byte    3rd byte      free bits:      Unicode value:
00055 
00056  0xxxxxxx                                  7           007F hex   (127)
00057  110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
00058  1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
00059 
00060 The value of each individual byte indicates its UTF-8 function, as follows:
00061 
00062  00 to 7F hex   (0 to 127):  first and only byte of a sequence.
00063  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
00064  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
00065  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
00066 */
00067 
00068 ////////////////////////////////////////////////////////////////////
00069 //     Function: StringUtf8Decoder::get_next_character
00070 //       Access: Public, Virtual
00071 //  Description: Returns the next character in sequence.
00072 ////////////////////////////////////////////////////////////////////
00073 int StringUtf8Decoder::
00074 get_next_character() {
00075   unsigned int result;
00076   while (!test_eof()) {
00077     result = (unsigned char)_input[_p++];
00078     if ((result & 0x80) == 0) {
00079       // A 7-bit ascii value in one byte.
00080       return result;
00081 
00082     } if ((result & 0xe0) == 0xc0) {
00083       // First byte of two.
00084       unsigned int two = 0;
00085       if (test_eof()) {
00086         express_cat.warning()
00087           << "utf-8 encoded string ends abruptly.\n";
00088         return -1;
00089       }
00090       two = (unsigned char)_input[_p++];
00091       result = ((result & 0x1f) << 6) | (two & 0x3f);
00092       return result;
00093       
00094     } else if ((result & 0xf0) == 0xe0) {
00095       // First byte of three.
00096       if (test_eof()) {
00097         express_cat.warning()
00098           << "utf-8 encoded string ends abruptly.\n";
00099         return -1;
00100       }
00101       unsigned int two = (unsigned char)_input[_p++];
00102       if (test_eof()) {
00103         express_cat.warning()
00104           << "utf-8 encoded string ends abruptly.\n";
00105         return -1;
00106       }
00107       unsigned int three = (unsigned char)_input[_p++];
00108       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
00109       return result;
00110     }
00111 
00112     // Otherwise--the high bit is set but it is not one of the
00113     // introductory utf-8 bytes--we have an error.
00114     express_cat.warning()
00115       << "Non utf-8 byte in string: 0x" << hex << result << dec << "\n";
00116   }
00117 
00118   // End of string reached.
00119   return -1;
00120 }
00121 
00122 ////////////////////////////////////////////////////////////////////
00123 //     Function: StringUnicodeDecoder::get_next_character
00124 //       Access: Public, Virtual
00125 //  Description: Returns the next character in sequence.
00126 ////////////////////////////////////////////////////////////////////
00127 int StringUnicodeDecoder::
00128 get_next_character() {
00129   if (test_eof()) {
00130     return -1;
00131   }
00132 
00133   unsigned int high = (unsigned char)_input[_p++];
00134   if (test_eof()) {
00135     express_cat.warning()
00136       << "Unicode-encoded string has odd number of bytes.\n";
00137     return -1;
00138   }
00139   unsigned int low = (unsigned char)_input[_p++];
00140   return ((high << 8) | low);
00141 }