#include <encoder_utf8.hh>
Inheritance diagram for lestes::lang::cplus::lex::encoder_utf8:

Public Member Functions | |
| ptr< ucn_token > | read (void) |
| Reads next token. | |
Static Public Member Functions | |
| static ptr< encoder_utf8 > | create (void) |
| Returns new instance. | |
Protected Member Functions | |
| encoder_utf8 (void) | |
| Creates the object. | |
| virtual void | gc_mark (void) |
| Marks the object. | |
Private Member Functions | |
| encoder_utf8 (const encoder_utf8 &) | |
| Hides copy constructor. | |
| encoder_utf8 & | operator= (const encoder_utf8 &) |
| Hides assignment operator. | |
Performs UTF8 character set encoding.
Definition at line 51 of file encoder_utf8.hh.
| lestes::lang::cplus::lex::encoder_utf8::encoder_utf8 | ( | void | ) | [protected] |
Creates the object.
Creates the object.
Definition at line 52 of file encoder_utf8.cc.
Referenced by create().
| lestes::lang::cplus::lex::encoder_utf8::encoder_utf8 | ( | const encoder_utf8 & | ) | [private] |
Hides copy constructor.
| ptr< ucn_token > lestes::lang::cplus::lex::encoder_utf8::read | ( | void | ) | [virtual] |
Reads next token.
Reads next token. Performs encoding from the UTF-8 character set and checks whether the input contains valid characters. If an error is encountered, it returns a token with type ucn_token::TOK_ERROR.
Returns: Token with type ucn_token::TOK_ERROR if the source character is invalid.
Returns: Token with type ucn_token::TOK_EOF in case of previous error.
Implements lestes::lang::cplus::lex::ucn_filter.
Definition at line 65 of file encoder_utf8.cc.
References lestes::lang::cplus::lex::ucn_token::create_error(), lestes::lang::cplus::lex::ucn_filter::input_read(), lestes::lang::cplus::lex::invalid_utf_character, lassert2, lestes::lang::cplus::lex::ucn_token::TOK_EOF, lestes::lang::cplus::lex::ucn_token::TOK_ERROR, and u.
00066 { 00067 // state of the reader 00068 typedef enum { 00069 START, END, ERR, STORE, TWO_LAST, THREE_NEXT, THREE_LAST, FOUR_FIRST, FOUR_NEXT, FOUR_LAST 00070 } state_type; 00071 00072 ptr<ucn_token> t; 00073 ucn_token_type utt; 00074 state_type state; 00075 ulint v = 0xbad, u = 0xbad, x = 0xbad, y = 0xbad, z = 0xbad; 00076 00077 state = START; 00078 00079 do { 00080 t = input_read(); 00081 utt = t->type_get(); 00082 00083 // for inherited errors 00084 if (utt == ucn_token::TOK_ERROR) { 00085 // pass the error through 00086 state = END; 00087 } else if (utt == ucn_token::TOK_EOF) { 00088 if (state == START) 00089 state = END; 00090 else 00091 state = ERR; 00092 } else { 00093 v = character::extract_value(t->value_get()); 00094 if (v > 0xFF) { 00095 // TODO pt report error 00096 state = ERR; 00097 } 00098 switch (state) { 00099 case START: 00100 if ((v & 0x80) == 0) { 00101 // short path, value is already in t 00102 state = END; 00103 } else { 00104 if ((v & 0xE0) == 0xC0) { 00105 // v <= 0xDF holds, implied 00106 if (v >= 0xC2 ) { 00107 u = 0; 00108 z = 0; 00109 y = v & 0x1F; 00110 state = TWO_LAST; 00111 } else { 00112 // TODO error report 00113 state = ERR; 00114 } 00115 } else if ((v & 0xF0) == 0xE0) { 00116 u = 0; 00117 z = v & 0x0F; 00118 state = THREE_NEXT; 00119 } else if ((v & 0xF8) == 0xF0) { 00120 // v >= 0xF0 holds, implied 00121 if (v <= 0xF4) { 00122 u = v & 0x0F; 00123 state = FOUR_FIRST; 00124 } else { 00125 // TODO error report 00126 state = ERR; 00127 } 00128 } else { 00129 // TODO error report 00130 state = ERR; 00131 } 00132 } 00133 break; 00134 case TWO_LAST: 00135 if (v >= 0x80 && v <= 0xBF) { 00136 x = v & 0x3F; 00137 state = STORE; 00138 } else { 00139 // TODO error report 00140 state = ERR; 00141 } 00142 break; 00143 case THREE_NEXT: 00144 if ((z == 0x00 && v >= 0xA0 && v <= 0xBF) || 00145 (z >= 0x01 && z <= 0x0C && v >= 0x80 && v <= 0xBF) || 00146 (z == 0x0D && v >= 0x80 && v <= 0x9F) || 00147 (z >= 0x0E && z <= 0x0F && v >= 0x80 && v <= 
0xBF)) { 00148 y = v & 0x3F; 00149 state = THREE_LAST; 00150 } else { 00151 // TODO error report 00152 state = ERR; 00153 } 00154 break; 00155 case THREE_LAST: 00156 if (v >= 0x80 && v <= 0xBF) { 00157 x = v & 0x3F; 00158 state = STORE; 00159 } else { 00160 // TODO error report 00161 state = ERR; 00162 } 00163 break; 00164 case FOUR_FIRST: 00165 if ((u == 0x00 && v >= 0x90 && v <= 0xBF) || 00166 (u >= 0x01 && u <= 0x03 && v >= 0x80 && v <= 0xBF) || 00167 (u == 0x04 && v >= 0x80 && v <= 0x8F)) { 00168 u = (u << 2) | ((v >> 4) & 0x03); 00169 z = v & 0x0F; 00170 state = FOUR_NEXT; 00171 } else { 00172 // TODO error report 00173 state = ERR; 00174 } 00175 break; 00176 case FOUR_NEXT: 00177 if (v >= 0x80 && v <= 0xBF) { 00178 y = v & 0x3F; 00179 state = FOUR_LAST; 00180 } else { 00181 // TODO error report 00182 state = ERR; 00183 } 00184 break; 00185 case FOUR_LAST: 00186 if (v >= 0x80 && v <= 0xBF) { 00187 x = v & 0x3F; 00188 state = STORE; 00189 } else { 00190 // TODO error report 00191 state = ERR; 00192 } 00193 break; 00194 default: 00195 lassert2(false,"You should never get here"); 00196 } 00197 } 00198 00199 if (state == STORE) { 00200 t = t->clone_value((u << 16) | (z << 12) | (y << 6) | x); 00201 state = END; 00202 } else if (state == ERR) { 00203 t = ucn_token::create_error(invalid_utf_character->format()); 00204 state = END; 00205 } 00206 } while (state != END); 00207 // return the token (possibly error) 00208 return t; 00209 }
| ptr< encoder_utf8 > lestes::lang::cplus::lex::encoder_utf8::create | ( | void | ) | [static] |
Returns new instance.
Returns new instance of the encoder.
Definition at line 223 of file encoder_utf8.cc.
References encoder_utf8().
00224 { 00225 return new encoder_utf8(); 00226 }
| void lestes::lang::cplus::lex::encoder_utf8::gc_mark | ( | void | ) | [protected, virtual] |
Marks the object.
Marks the object.
Reimplemented from lestes::lang::cplus::lex::ucn_filter.
Definition at line 214 of file encoder_utf8.cc.
References lestes::lang::cplus::lex::ucn_filter::gc_mark().
00215 { 00216 encoder::gc_mark(); 00217 }
| encoder_utf8& lestes::lang::cplus::lex::encoder_utf8::operator= | ( | const encoder_utf8 & | ) | [private] |
Hides assignment operator.
1.5.1-20070107