00001 // Filename: stringDecoder.cxx 00002 // Created by: drose (11Feb02) 00003 // 00004 //////////////////////////////////////////////////////////////////// 00005 // 00006 // PANDA 3D SOFTWARE 00007 // Copyright (c) 2001, Disney Enterprises, Inc. All rights reserved 00008 // 00009 // All use of this software is subject to the terms of the Panda 3d 00010 // Software license. You should have received a copy of this license 00011 // along with this source code; you will also find a current copy of 00012 // the license at http://www.panda3d.org/license.txt . 00013 // 00014 // To contact the maintainers of this program write to 00015 // panda3d@yahoogroups.com . 00016 // 00017 //////////////////////////////////////////////////////////////////// 00018 00019 #include "stringDecoder.h" 00020 #include "config_express.h" 00021 00022 //////////////////////////////////////////////////////////////////// 00023 // Function: StringDecoder::Destructor 00024 // Access: Public, Virtual 00025 // Description: 00026 //////////////////////////////////////////////////////////////////// 00027 StringDecoder:: 00028 ~StringDecoder() { 00029 } 00030 00031 //////////////////////////////////////////////////////////////////// 00032 // Function: StringDecoder::get_next_character 00033 // Access: Public, Virtual 00034 // Description: Returns the next character in sequence. 00035 //////////////////////////////////////////////////////////////////// 00036 int StringDecoder:: 00037 get_next_character() { 00038 if (test_eof()) { 00039 return -1; 00040 } 00041 return (unsigned char)_input[_p++]; 00042 } 00043 00044 /* 00045 In UTF-8, each 16-bit Unicode character is encoded as a sequence of 00046 one, two, or three 8-bit bytes, depending on the value of the 00047 character. The following table shows the format of such UTF-8 byte 00048 sequences (where the "free bits" shown by x's in the table are 00049 combined in the order shown, and interpreted from most significant to 00050 least significant): 00051 00052 Binary format of bytes in sequence: 00053 Number of Maximum expressible 00054 1st byte 2nd byte 3rd byte free bits: Unicode value: 00055 00056 0xxxxxxx 7 007F hex (127) 00057 110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047) 00058 1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535) 00059 00060 The value of each individual byte indicates its UTF-8 function, as follows: 00061 00062 00 to 7F hex (0 to 127): first and only byte of a sequence. 00063 80 to BF hex (128 to 191): continuing byte in a multi-byte sequence. 00064 C2 to DF hex (194 to 223): first byte of a two-byte sequence. 00065 E0 to EF hex (224 to 239): first byte of a three-byte sequence. 00066 */ 00067 00068 //////////////////////////////////////////////////////////////////// 00069 // Function: StringUtf8Decoder::get_next_character 00070 // Access: Public, Virtual 00071 // Description: Returns the next character in sequence. 00072 //////////////////////////////////////////////////////////////////// 00073 int StringUtf8Decoder:: 00074 get_next_character() { 00075 unsigned int result; 00076 while (!test_eof()) { 00077 result = (unsigned char)_input[_p++]; 00078 if ((result & 0x80) == 0) { 00079 // A 7-bit ascii value in one byte. 00080 return result; 00081 00082 } if ((result & 0xe0) == 0xc0) { 00083 // First byte of two. 00084 unsigned int two = 0; 00085 if (test_eof()) { 00086 express_cat.warning() 00087 << "utf-8 encoded string ends abruptly.\n"; 00088 return -1; 00089 } 00090 two = (unsigned char)_input[_p++]; 00091 result = ((result & 0x1f) << 6) | (two & 0x3f); 00092 return result; 00093 00094 } else if ((result & 0xf0) == 0xe0) { 00095 // First byte of three. 00096 if (test_eof()) { 00097 express_cat.warning() 00098 << "utf-8 encoded string ends abruptly.\n"; 00099 return -1; 00100 } 00101 unsigned int two = (unsigned char)_input[_p++]; 00102 if (test_eof()) { 00103 express_cat.warning() 00104 << "utf-8 encoded string ends abruptly.\n"; 00105 return -1; 00106 } 00107 unsigned int three = (unsigned char)_input[_p++]; 00108 result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); 00109 return result; 00110 } 00111 00112 // Otherwise--the high bit is set but it is not one of the 00113 // introductory utf-8 bytes--we have an error. 00114 express_cat.warning() 00115 << "Non utf-8 byte in string: 0x" << hex << result << dec << "\n"; 00116 } 00117 00118 // End of string reached. 00119 return -1; 00120 } 00121 00122 //////////////////////////////////////////////////////////////////// 00123 // Function: StringUnicodeDecoder::get_next_character 00124 // Access: Public, Virtual 00125 // Description: Returns the next character in sequence. 00126 //////////////////////////////////////////////////////////////////// 00127 int StringUnicodeDecoder:: 00128 get_next_character() { 00129 if (test_eof()) { 00130 return -1; 00131 } 00132 00133 unsigned int high = (unsigned char)_input[_p++]; 00134 if (test_eof()) { 00135 express_cat.warning() 00136 << "Unicode-encoded string has odd number of bytes.\n"; 00137 return -1; 00138 } 00139 unsigned int low = (unsigned char)_input[_p++]; 00140 return ((high << 8) | low); 00141 }