00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include <lestes/common.hh>
00036 #include <lestes/lang/cplus/lex/ucn_token_buffer.hh>
00037 #include <lestes/lang/cplus/lex/ucn_token_buffer.m.hh>
00038 #include <lestes/lang/cplus/lex/ucn_token.hh>
00039 #include <lestes/lang/cplus/lex/token_value.hh>
00040 #include <lestes/lang/cplus/lex/line_control.hh>
00041 #include <lestes/std/source_location.hh>
00042
00043 package(lestes);
00044 package(lang);
00045 package(cplus);
00046 package(lex);
00047
00048 using namespace ::std;
00049
00050
00051
00052
00053
00054
00055
00056 ucn_token_buffer::ucn_token_buffer(const ptr<line_control> &a_lines):
00057 buffer(buffer_type::create()),
00058 lines(checked(a_lines))
00059 {
00060 }
00061
00062
00063
00064
00065
00066 void ucn_token_buffer::add_back(const ptr<ucn_token> &item)
00067 {
00068 buffer->push_back(item);
00069 }
00070
00071
00072
00073
00074
00075
00076
00077 void ucn_token_buffer::advance(ucn_token_buffer::size_type len)
00078 {
00079 lassert(len <= length());
00080 while (len != 0) {
00081 buffer->pop_front();
00082 len--;
00083 }
00084 }
00085
00086
00087
00088
00089
00090
00091 ptr<ucn_token> ucn_token_buffer::peek_front(void) const
00092 {
00093 lassert(length() != 0);
00094 return buffer->front();
00095 }
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106 ptr<token_value> ucn_token_buffer::extract_ordinary(ucn_token_buffer::size_type len)
00107 {
00108 lassert(len <= length());
00109
00110
00111 ucn_string us(len,0xbeef);
00112
00113 buffer_type::iterator bit = buffer->begin();
00114 ucn_string::iterator sit = us.begin();
00115 for (ucn_token_buffer::size_type i = 0; i < len; i++, ++bit, ++sit) {
00116 *sit = (*bit)->value_get();
00117 }
00118
00119
00120 buffer->erase(buffer->begin(),bit);
00121
00122 return token_value::create(us);
00123 }
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135 ptr<token_value> ucn_token_buffer::extract_until(ucn stop)
00136 {
00137 ucn_token_buffer::size_type len = length();
00138
00139 ucn_string us(len,0xbeef);
00140 ucn u;
00141
00142 buffer_type::iterator bit = buffer->begin();
00143 ucn_string::iterator sit = us.begin();
00144 for (ucn_token_buffer::size_type i = 0; i < len; i++, ++bit, ++sit) {
00145 u = (*bit)->value_get();
00146 if (u == stop) {
00147
00148 buffer->erase(buffer->begin(),bit);
00149
00150 return token_value::create(ucn_string(us.begin(),sit));
00151 }
00152 *sit = u;
00153 }
00154
00155 lassert2(false,"The stop value was not found");
00156 return NULL;
00157 }
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173 ptr<token_value> ucn_token_buffer::extract_invalid_ucn(ucn_token_buffer::size_type len, bool identifier)
00174 {
00175 lassert(len <= length());
00176
00177
00178 enum {
00179 BEGIN,
00180 BACK,
00181 UCN
00182 } fstate = BEGIN;
00183
00184 ulint count = 0xbad, value = 0xbad;
00185 ptr<ucn_token> t;
00186 ptr<simple_location> loc;
00187 ucn_token_type utt;
00188 ucn u;
00189
00190
00191 ucn_string us(len,0xbeef);
00192
00193 buffer_type::iterator bit = buffer->begin();
00194 ucn_string::iterator sit = us.begin();
00195 for (ucn_token_buffer::size_type i = 0; i < len; i++, ++bit) {
00196 t = *bit;
00197 utt = t->type_get();
00198 u = t->value_get();
00199 loc = t->location_get();
00200 switch (fstate) {
00201 case BACK:
00202
00203 lassert(utt == ucn_token::TOK_BASIC &&
00204 (u == character::ascii_lower_u || u == character::ascii_upper_u));
00205 count = u == character::ascii_lower_u ? 4 : 8;
00206 value = 0;
00207 fstate = UCN;
00208 break;
00209 case UCN:
00210 if (utt == ucn_token::TOK_BASIC && character::is_xdigit(u)) {
00211 value = (value << 4) | (character::extract_xdigit(u) & 0x0f);
00212 if (--count == 0) {
00213 u = character::create_internal(value);
00214
00215 if (!character::is_translated(u)) {
00216
00217 report << ucn_escape_value_invalid << lines->translate_location(loc);
00218
00219 u = character::ascii_underscore;
00220 } else if (identifier && !character::is_translated_identifier(u)) {
00221
00222 report << ucn_escape_value_invalid_in_identifier << lines->translate_location(loc);
00223
00224 }
00225
00226 *sit = u;
00227 ++sit;
00228 fstate = BEGIN;
00229 }
00230 break;
00231 }
00232
00233
00234 report << ucn_escape_insufficient_digits << lines->translate_location(loc);
00235
00236
00237 *sit = character::ascii_underscore;
00238 ++sit;
00239 fstate = BEGIN;
00240
00241
00242 case BEGIN:
00243 if (utt == ucn_token::TOK_BASIC) {
00244 if (u == character::ascii_backslash) {
00245 fstate = BACK;
00246 break;
00247 }
00248 } else {
00249 lassert(utt == ucn_token::TOK_TRANSLATED);
00250 if (identifier && !character::is_translated_identifier(u)) {
00251
00252 report << ucn_escape_value_invalid_in_identifier << lines->translate_location(loc);
00253 }
00254 }
00255 *sit = u;
00256 ++sit;
00257 break;
00258 }
00259 }
00260
00261
00262 lassert(fstate != BACK);
00263
00264 if (fstate == UCN) {
00265
00266 report << ucn_escape_insufficient_digits << lines->translate_location(loc);
00267
00268 *sit = character::ascii_underscore;
00269 ++sit;
00270 }
00271
00272
00273 buffer->erase(buffer->begin(),bit);
00274
00275
00276 return token_value::create(ucn_string(us.begin(),sit));
00277 }
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291 ptr<token_value> ucn_token_buffer::extract_simple_ucn(ucn_token_buffer::size_type len, bool identifier)
00292 {
00293 lassert(len <= length());
00294
00295
00296 enum {
00297 BEGIN,
00298 BACK,
00299 UCN
00300 } fstate = BEGIN;
00301
00302 ulint count = 0xbad, value = 0xbad;
00303 ptr<ucn_token> t;
00304 ptr<simple_location> loc;
00305 ucn_token_type utt;
00306 ucn u;
00307
00308
00309 ucn_string us(len,0xbeef);
00310
00311 buffer_type::iterator bit = buffer->begin();
00312 ucn_string::iterator sit = us.begin();
00313 for (ucn_token_buffer::size_type i = 0; i < len; i++, ++bit) {
00314 t = *bit;
00315 loc = t->location_get();
00316 utt = t->type_get();
00317 u = t->value_get();
00318
00319 switch (fstate) {
00320 case BEGIN:
00321 if (utt == ucn_token::TOK_BASIC) {
00322 if (u == character::ascii_backslash) {
00323 fstate = BACK;
00324 break;
00325 }
00326 } else {
00327 lassert(utt == ucn_token::TOK_TRANSLATED);
00328 if (identifier && !character::is_translated_identifier(u)) {
00329
00330 report << ucn_escape_value_invalid_in_identifier << lines->translate_location(loc);
00331 }
00332 }
00333 *sit = u;
00334 ++sit;
00335 break;
00336 case BACK:
00337
00338 lassert(utt == ucn_token::TOK_BASIC &&
00339 (u == character::ascii_lower_u || u == character::ascii_upper_u));
00340 count = u == character::ascii_lower_u ? 4 : 8;
00341 value = 0;
00342 fstate = UCN;
00343 break;
00344 case UCN:
00345
00346 lassert(utt == ucn_token::TOK_BASIC && character::is_xdigit(u));
00347 value = (value << 4) | (character::extract_xdigit(u) & 0x0f);
00348 if (--count == 0) {
00349 u = character::create_internal(value);
00350
00351 if (!character::is_translated(u)) {
00352
00353 report << ucn_escape_value_invalid << lines->translate_location(loc);
00354
00355 u = character::ascii_underscore;
00356 } else if (identifier && !character::is_translated_identifier(u)) {
00357
00358 report << ucn_escape_value_invalid_in_identifier << lines->translate_location(loc);
00359
00360 }
00361 *sit = u;
00362 ++sit;
00363 fstate = BEGIN;
00364 }
00365 break;
00366 }
00367 }
00368
00369
00370 lassert(fstate == BEGIN);
00371
00372
00373 buffer->erase(buffer->begin(),bit);
00374
00375
00376 return token_value::create(ucn_string(us.begin(),sit));
00377 }
00378
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390 ptr<token_value> ucn_token_buffer::extract_ucn_literal(ucn_token_buffer::size_type len)
00391 {
00392 lassert(len <= length());
00393
00394
00395 enum {
00396 BEGIN,
00397 BACK,
00398 UCN
00399 } fstate = BEGIN;
00400 ulint count = 0xbad, value = 0xbad;
00401 ptr<ucn_token> t;
00402 ucn_token_type utt;
00403 ptr<simple_location> loc;
00404 ucn u;
00405
00406
00407 ucn_string us(len,0xbeef);
00408
00409 buffer_type::iterator bit = buffer->begin();
00410 ucn_string::iterator sit = us.begin();
00411 for (ucn_token_buffer::size_type i = 0; i < len; i++, ++bit) {
00412 t = *bit;
00413 loc = t->location_get();
00414 utt = t->type_get();
00415 u = t->value_get();
00416
00417 switch (fstate) {
00418 case BEGIN:
00419 if (utt == ucn_token::TOK_BASIC && u == character::ascii_backslash) {
00420 fstate = BACK;
00421 } else {
00422 *sit = u;
00423 ++sit;
00424 }
00425 break;
00426 case BACK:
00427
00428 lassert(utt == ucn_token::TOK_BASIC);
00429 if (u == character::ascii_lower_u || u == character::ascii_upper_u) {
00430 count = u == character::ascii_lower_u ? 4 : 8;
00431 value = 0;
00432 fstate = UCN;
00433 } else fstate = BEGIN;
00434 break;
00435 case UCN:
00436
00437 lassert(utt == ucn_token::TOK_BASIC && character::is_xdigit(u));
00438 value = (value << 4) | (character::extract_xdigit(u) & 0x0f);
00439 if (--count == 0) {
00440 u = character::create_internal(value);
00441
00442 if (!character::is_translated(u)) {
00443
00444 report << ucn_escape_value_invalid << lines->translate_location(loc);
00445
00446 u = character::ascii_underscore;
00447 }
00448
00449 *sit = u;
00450 ++sit;
00451 fstate = BEGIN;
00452 }
00453 break;
00454 }
00455 }
00456
00457
00458 lassert(fstate == BEGIN);
00459
00460
00461 buffer->erase(buffer->begin(),bit);
00462
00463
00464 return token_value::create(ucn_string(us.begin(),sit));
00465 }
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477 ptr<token_value> ucn_token_buffer::extract_bad_literal(ucn_token_buffer::size_type len)
00478 {
00479 lassert(len <= length());
00480
00481
00482 enum {
00483 BEGIN,
00484 PASS,
00485 BACK,
00486 UCN,
00487 OCT,
00488 HEX
00489 } fstate = BEGIN;
00490
00491 ulint count = 0xbad, value = 0xbad;
00492 ptr<ucn_token> t;
00493 ptr<simple_location> loc;
00494 ucn_token_type utt = 0xbad;
00495 ucn u = 0xbad;
00496
00497
00498 ucn_string us(len,0xbeef);
00499
00500 buffer_type::iterator bit = buffer->begin();
00501 ucn_string::iterator sit = us.begin();
00502 ucn_token_buffer::size_type i = 0;
00503 while (true) {
00504
00505 if (fstate == PASS) {
00506 fstate = BEGIN;
00507 } else if (i < len) {
00508 t = *bit;
00509 ++bit;
00510 ++i;
00511 loc = t->location_get();
00512 utt = t->type_get();
00513 u = t->value_get();
00514 } else break;
00515
00516 switch (fstate) {
00517 case BEGIN:
00518 if (utt == ucn_token::TOK_BASIC && u == character::ascii_backslash) {
00519 fstate = BACK;
00520 } else {
00521 *sit = u;
00522 ++sit;
00523 }
00524 break;
00525 case BACK:
00526 if (utt == ucn_token::TOK_BASIC) {
00527 switch (u) {
00528 case character::ascii_lower_u:
00529 count = 4;
00530 fstate = UCN;
00531 break;
00532 case character::ascii_upper_u:
00533 count = 8;
00534 fstate = UCN;
00535 break;
00536 case character::ascii_lower_x:
00537 count = 1;
00538 fstate = HEX;
00539 break;
00540 case character::ascii_quote:
00541 case character::ascii_dquote:
00542 case character::ascii_qmark:
00543 case character::ascii_backslash:
00544 case character::ascii_lower_a:
00545 case character::ascii_lower_b:
00546 case character::ascii_lower_f:
00547 case character::ascii_lower_n:
00548 case character::ascii_lower_r:
00549 case character::ascii_lower_t:
00550 case character::ascii_lower_v:
00551 *sit = character::ascii_backslash;
00552 ++sit;
00553 *sit = u;
00554 ++sit;
00555 fstate = BEGIN;
00556 break;
00557 default:
00558 if (character::is_odigit(u)) {
00559 *sit = character::ascii_backslash;
00560 ++sit;
00561 *sit = u;
00562 ++sit;
00563 count = 2;
00564 fstate = OCT;
00565 } else {
00566
00567 report << invalid_escape_sequence << lines->translate_location(loc);
00568 *sit = character::ascii_underscore;
00569 ++sit;
00570 fstate = BEGIN;
00571 }
00572 }
00573 } else {
00574
00575 report << invalid_escape_sequence << lines->translate_location(loc);
00576 *sit = character::ascii_underscore;
00577 ++sit;
00578 fstate = BEGIN;
00579 }
00580 break;
00581 case UCN:
00582
00583 if (utt != ucn_token::TOK_BASIC || !character::is_xdigit(u)) {
00584
00585 report << ucn_escape_insufficient_digits << lines->translate_location(loc);
00586 *sit = character::ascii_underscore;
00587 ++sit;
00588
00589 fstate = PASS;
00590 break;
00591 }
00592
00593 value = (value << 4) | (character::extract_xdigit(u) & 0x0f);
00594 if (--count == 0) {
00595 u = character::create_internal(value);
00596 if (!character::is_translated(u)) {
00597
00598 report << ucn_escape_value_invalid << lines->translate_location(loc);
00599 *sit = character::ascii_underscore;
00600 ++sit;
00601 } else {
00602 *sit = u;
00603 ++sit;
00604 }
00605 fstate = BEGIN;
00606 }
00607 break;
00608 case OCT:
00609 if (utt == ucn_token::TOK_BASIC && character::is_odigit(u)) {
00610 *sit = u;
00611 ++sit;
00612 if (--count == 0) fstate = BEGIN;
00613 } else fstate = PASS;
00614 break;
00615 case HEX:
00616 if (utt == ucn_token::TOK_BASIC && character::is_xdigit(u)) {
00617 if (count) {
00618 count = 0;
00619 *sit = character::ascii_backslash;
00620 ++sit;
00621 *sit = character::ascii_lower_x;
00622 ++sit;
00623 }
00624 *sit = u;
00625 ++sit;
00626 } else {
00627 if (count) {
00628
00629 report << missing_hexadecimal_digits << lines->translate_location(loc);
00630 *sit = character::ascii_underscore;
00631 ++sit;
00632 }
00633
00634 fstate = PASS;
00635 }
00636 break;
00637 default:
00638 lassert2(false,"You should never get here");
00639 }
00640 }
00641
00642 switch (fstate) {
00643 case BEGIN:
00644 case OCT:
00645 break;
00646 case HEX:
00647 if (count) {
00648
00649 report << missing_hexadecimal_digits << lines->translate_location(loc);
00650 *sit = character::ascii_underscore;
00651 ++sit;
00652 }
00653 break;
00654 case UCN:
00655
00656 report << ucn_escape_insufficient_digits << lines->translate_location(loc);
00657 *sit = character::ascii_underscore;
00658 ++sit;
00659 break;
00660 default:
00661 lassert2(false,"You should never get here");
00662 }
00663
00664
00665 buffer->erase(buffer->begin(),bit);
00666
00667
00668 return token_value::create(ucn_string(us.begin(),sit));
00669 }
00670
00671
00672
00673
00674
00675 ucn_token_buffer::size_type ucn_token_buffer::length(void) const
00676 {
00677 return buffer->size();
00678 }
00679
00680
00681
00682
00683 void ucn_token_buffer::gc_mark(void)
00684 {
00685 buffer.gc_mark();
00686 lines.gc_mark();
00687 }
00688
00689
00690
00691
00692
00693
00694
00695 ptr<ucn_token_buffer> ucn_token_buffer::create(const ptr<line_control> &a_lines)
00696 {
00697 return new ucn_token_buffer(a_lines);
00698 }
00699
00700 end_package(lex);
00701 end_package(cplus);
00702 end_package(lang);
00703 end_package(lestes);
00704