encoder_utf8.cc

Go to the documentation of this file.
00001 /*
00002    The lestes compiler suite
00003    Copyright (C) 2002, 2003, 2004, 2005 Miroslav Tichy
00004    Copyright (C) 2002, 2003, 2004, 2005 Petr Zika
00005    Copyright (C) 2002, 2003, 2004, 2005 Vojtech Hala
00006    Copyright (C) 2002, 2003, 2004, 2005 Jiri Kosina
00007    Copyright (C) 2002, 2003, 2004, 2005 Pavel Sanda
00008    Copyright (C) 2002, 2003, 2004, 2005 Jan Zouhar
00009    Copyright (C) 2002, 2003, 2004, 2005 Rudolf Thomas
00010 
00011    This program is free software; you can redistribute it and/or modify
00012    it under the terms of the GNU General Public License as published by
00013    the Free Software Foundation; version 2 of the License.
00014 
00015    This program is distributed in the hope that it will be useful,
00016    but WITHOUT ANY WARRANTY; without even the implied warranty of
00017    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018    GNU General Public License for more details.
00019 
00020    See the full text of the GNU General Public License version 2, and
00021    the limitations in the file doc/LICENSE.
00022 
00023    By accepting the license the licensee waives any and all claims
00024    against the copyright holder(s) related in whole or in part to the
00025    work, its use, and/or the inability to use it.
00026  
00027  */
00028 /*! \file
00029   \brief UTF-8 encoder.
00030 
00031   Definition of encoder_utf8 class performing UTF-8 character set encoding.
00032   \author pt
00033 */
00034 #include <lestes/common.hh>
00035 #include <lestes/lang/cplus/lex/encoder_utf8.hh>
00036 #include <lestes/lang/cplus/lex/encoder_utf8.m.hh>
00037 #include <lestes/lang/cplus/lex/ucn_token.hh>
00038 #include <lestes/lang/cplus/lex/encoder_utf8.m.hh>
00039 #include <lestes/msg/message.hh>
00040 
00041 package(lestes);
00042 package(lang);
00043 package(cplus);
00044 package(lex);
00045 
00046 using namespace ::std;
00047 
00048 /*!
00049   Creates the object.
00050   \post bad == NULL
00051 */
00052 encoder_utf8::encoder_utf8(void)
00053 {
00054 }
00055 
00056 /*!
00057   Reads next token. Performs encoding from UTF-8 character set.
00058   Checks whether the input contains valid characters.
00059   If error is encountered, it returns token with type ucn_token::TOK_ERROR.
00060   \pre  Input into the filter is set.
00061   \return  The next token encoded from 7 bit ASCII.
00062   \return  Token with type ucn_token::TOK_ERROR if the source character is invalid.
00063   \return  Token with type ucn_token::TOK_EOF in case of previous error.
00064 */
00065 ptr<ucn_token> encoder_utf8::read(void)
00066 {
00067         // state of the reader
00068         typedef enum { 
00069                 START, END, ERR, STORE, TWO_LAST, THREE_NEXT, THREE_LAST, FOUR_FIRST, FOUR_NEXT, FOUR_LAST 
00070         } state_type;
00071 
00072         ptr<ucn_token> t;
00073         ucn_token_type utt;
00074         state_type state;
00075         ulint v = 0xbad, u = 0xbad, x = 0xbad, y = 0xbad, z = 0xbad;
00076 
00077         state = START;
00078 
00079         do {
00080                 t = input_read();
00081                 utt = t->type_get();
00082 
00083                 // for inherited errors
00084                 if (utt == ucn_token::TOK_ERROR) {
00085                         // pass the error through
00086                         state = END;
00087                 } else if (utt == ucn_token::TOK_EOF) {
00088                         if (state == START)
00089                                 state = END;
00090                         else
00091                                 state = ERR;
00092                 } else {              
00093                         v = character::extract_value(t->value_get());
00094                         if (v > 0xFF) {
00095                                 // TODO pt report error
00096                                 state = ERR;
00097                         }
00098                         switch (state) {
00099                                 case START:
00100                                         if ((v & 0x80) == 0) {
00101                                                 // short path, value is already in t
00102                                                 state = END;
00103                                         } else {
00104                                                 if ((v & 0xE0) == 0xC0) {
00105                                                         // v <= 0xDF holds, implied
00106                                                         if (v >= 0xC2 ) {
00107                                                                 u = 0;
00108                                                                 z = 0;
00109                                                                 y = v & 0x1F;
00110                                                                 state = TWO_LAST;
00111                                                         } else {
00112                                                                 // TODO error report
00113                                                                 state = ERR;
00114                                                         }
00115                                                 } else if ((v & 0xF0) == 0xE0) {
00116                                                         u = 0;
00117                                                         z = v & 0x0F;                     
00118                                                         state = THREE_NEXT;
00119                                                 } else if ((v & 0xF8) == 0xF0) {
00120                                                         // v >= 0xF0 holds, implied
00121                                                         if (v <= 0xF4) {
00122                                                                 u = v & 0x0F;
00123                                                                 state = FOUR_FIRST;
00124                                                         } else {
00125                                                                 // TODO error report
00126                                                                 state = ERR;
00127                                                         }
00128                                                 } else {
00129                                                                 // TODO error report
00130                                                                 state = ERR;
00131                                                 }
00132                                         }
00133                                         break;
00134                                 case TWO_LAST:
00135                                         if (v >= 0x80 && v <= 0xBF) {
00136                                                 x = v & 0x3F;
00137                                                 state = STORE;
00138                                         } else {
00139                                                 // TODO error report
00140                                                 state = ERR;
00141                                         }
00142                                         break;
00143                                 case THREE_NEXT:
00144                                         if ((z == 0x00 && v >= 0xA0 && v <= 0xBF) ||
00145                                                  (z >= 0x01 && z <= 0x0C && v >= 0x80 && v <= 0xBF) ||
00146                                                  (z == 0x0D && v >= 0x80 && v <= 0x9F) ||
00147                                                  (z >= 0x0E && z <= 0x0F && v >= 0x80 && v <= 0xBF)) {
00148                                                 y = v & 0x3F;
00149                                                 state = THREE_LAST;
00150                                         } else {
00151                                                 // TODO error report
00152                                                 state = ERR;
00153                                         }
00154                                         break;
00155                                 case THREE_LAST:
00156                                         if (v >= 0x80 && v <= 0xBF) {
00157                                                 x = v & 0x3F;
00158                                                 state = STORE;
00159                                         } else {
00160                                                 // TODO error report
00161                                                 state = ERR;
00162                                         }
00163                                         break;
00164                                 case FOUR_FIRST:
00165                                         if ((u == 0x00 && v >= 0x90 && v <= 0xBF) ||
00166                                                  (u >= 0x01 && u <= 0x03 && v >= 0x80 && v <= 0xBF) ||
00167                                                  (u == 0x04 && v >= 0x80 && v <= 0x8F)) {
00168                                                 u = (u << 2) | ((v >> 4) & 0x03);
00169                                                 z = v & 0x0F;
00170                                                 state = FOUR_NEXT;
00171                                         } else {
00172                                                 // TODO error report
00173                                                 state = ERR;
00174                                         }
00175                                         break;
00176                                 case FOUR_NEXT:
00177                                         if (v >= 0x80 && v <= 0xBF) {
00178                                                 y = v & 0x3F;
00179                                                 state = FOUR_LAST;
00180                                         } else {
00181                                                 // TODO error report
00182                                                 state = ERR;
00183                                         }
00184                                         break;
00185                                 case FOUR_LAST:
00186                                         if (v >= 0x80 && v <= 0xBF) {
00187                                                 x = v & 0x3F;
00188                                                 state = STORE;
00189                                         } else {
00190                                                 // TODO error report
00191                                                 state = ERR;
00192                                         }
00193                                         break;
00194                                 default:
00195                                         lassert2(false,"You should never get here");
00196                         }         
00197                 }
00198                                 
00199                 if (state == STORE) {
00200                         t = t->clone_value((u << 16) | (z << 12) | (y << 6) | x);
00201                         state = END;
00202                 } else if (state == ERR) {
00203                         t = ucn_token::create_error(invalid_utf_character->format());
00204                         state = END;
00205                 }
00206         } while (state != END);
00207         // return the token (possibly error)
00208         return t;
00209 }
00210 
00211 /*!
00212   Marks the object.
00213 */
00214 void encoder_utf8::gc_mark(void)
00215 {
00216         encoder::gc_mark();
00217 }
00218 
00219 /*!
00220   Returns new instance of the encoder.
00221   \return  The new instance.
00222 */
00223 ptr<encoder_utf8> encoder_utf8::create(void)
00224 {
00225         return new encoder_utf8();
00226 }
00227 
00228 end_package(lex);
00229 end_package(cplus);
00230 end_package(lang);
00231 end_package(lestes);
00232 
00233 /* vim: set ft=lestes : */

Generated on Mon Feb 12 18:22:33 2007 for lestes by doxygen 1.5.1-20070107