Numworks Epsilon  1.4.1
Graphing Calculator Operating System
lexer.c
Go to the documentation of this file.
1 /*
2  * This file is part of the MicroPython project, http://micropython.org/
3  *
4  * The MIT License (MIT)
5  *
6  * Copyright (c) 2013, 2014 Damien P. George
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <assert.h>
30 
31 #include "py/reader.h"
32 #include "py/lexer.h"
33 #include "py/runtime.h"
34 
35 #if MICROPY_ENABLE_COMPILER
36 
// Distance between tab stops; next_char() rounds the column up to the next
// multiple of TAB_SIZE (plus one) when it consumes a tab character.
37 #define TAB_SIZE (8)
38 
39 // TODO seems that CPython allows NULL byte in the input stream
40 // don't know if that's intentional or not, but we don't allow it
41 
// The reader's end-of-input value, cast to the type of the lookahead characters.
42 #define MP_LEXER_EOF ((unichar)MP_READER_EOF)
// The character the lexer is currently positioned on (first lookahead slot).
43 #define CUR_CHAR(lex) ((lex)->chr0)
44 
45 STATIC bool is_end(mp_lexer_t *lex) {
46  return lex->chr0 == MP_LEXER_EOF;
47 }
48 
49 STATIC bool is_physical_newline(mp_lexer_t *lex) {
50  return lex->chr0 == '\n';
51 }
52 
53 STATIC bool is_char(mp_lexer_t *lex, byte c) {
54  return lex->chr0 == c;
55 }
56 
57 STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
58  return lex->chr0 == c1 || lex->chr0 == c2;
59 }
60 
61 STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
62  return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
63 }
64 
65 STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
66  return lex->chr1 == c;
67 }
68 
69 STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
70  return lex->chr1 == c1 || lex->chr1 == c2;
71 }
72 
73 STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
74  return lex->chr2 == c1 || lex->chr2 == c2;
75 }
76 
77 STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
78  return lex->chr0 == c1 && lex->chr1 == c2;
79 }
80 
81 STATIC bool is_whitespace(mp_lexer_t *lex) {
82  return unichar_isspace(lex->chr0);
83 }
84 
85 STATIC bool is_letter(mp_lexer_t *lex) {
86  return unichar_isalpha(lex->chr0);
87 }
88 
89 STATIC bool is_digit(mp_lexer_t *lex) {
90  return unichar_isdigit(lex->chr0);
91 }
92 
93 STATIC bool is_following_digit(mp_lexer_t *lex) {
94  return unichar_isdigit(lex->chr1);
95 }
96 
97 STATIC bool is_following_base_char(mp_lexer_t *lex) {
98  const unichar chr1 = lex->chr1 | 0x20;
99  return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
100 }
101 
102 STATIC bool is_following_odigit(mp_lexer_t *lex) {
103  return lex->chr1 >= '0' && lex->chr1 <= '7';
104 }
105 
106 STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
107  return is_char_or(lex, '\'', '\"')
108  || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
109  || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
110  && is_char_following_following_or(lex, '\'', '\"'));
111 }
112 
113 // to easily parse utf-8 identifiers we allow any raw byte with high bit set
114 STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
115  return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
116 }
117 
118 STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
119  return is_head_of_identifier(lex) || is_digit(lex);
120 }
121 
122 STATIC void next_char(mp_lexer_t *lex) {
123  if (lex->chr0 == '\n') {
124  // a new line
125  ++lex->line;
126  lex->column = 1;
127  } else if (lex->chr0 == '\t') {
128  // a tab
129  lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
130  } else {
131  // a character worth one column
132  ++lex->column;
133  }
134 
135  lex->chr0 = lex->chr1;
136  lex->chr1 = lex->chr2;
137  lex->chr2 = lex->reader.readbyte(lex->reader.data);
138 
139  if (lex->chr1 == '\r') {
140  // CR is a new line, converted to LF
141  lex->chr1 = '\n';
142  if (lex->chr2 == '\n') {
143  // CR LF is a single new line, throw out the extra LF
144  lex->chr2 = lex->reader.readbyte(lex->reader.data);
145  }
146  }
147 
148  // check if we need to insert a newline at end of file
149  if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
150  lex->chr2 = '\n';
151  }
152 }
153 
154 STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
155  if (lex->num_indent_level >= lex->alloc_indent_level) {
158  }
159  lex->indent_level[lex->num_indent_level++] = indent;
160 }
161 
162 STATIC size_t indent_top(mp_lexer_t *lex) {
163  return lex->indent_level[lex->num_indent_level - 1];
164 }
165 
166 STATIC void indent_pop(mp_lexer_t *lex) {
167  lex->num_indent_level -= 1;
168 }
169 
170 // some tricky operator encoding:
171 // <op> = begin with <op>, if this opchar matches then begin here
172 // e<op> = end with <op>, if this opchar matches then end
173 // c<op> = continue with <op>, if this opchar matches then continue matching
174 // this means if the start of two ops are the same then they are equal til the last char
175 
176 STATIC const char *const tok_enc =
177  "()[]{},:;@~" // singles
178  "<e=c<e=" // < <= << <<=
179  ">e=c>e=" // > >= >> >>=
180  "*e=c*e=" // * *= ** **=
181  "+e=" // + +=
182  "-e=e>" // - -= ->
183  "&e=" // & &=
184  "|e=" // | |=
185  "/e=c/e=" // / /= // //=
186  "%e=" // % %=
187  "^e=" // ^ ^=
188  "=e=" // = ==
189  "!."; // start of special cases: != . ...
190 
// Table indexed by the token index computed while decoding tok_enc in
// mp_lexer_to_next(); each entry is the token kind for that position.
// NOTE(review): the embedded listing numbers jump from 192 to 197 and from
// 197 to 209, so the initializer entries appear to be missing from this
// extract -- restore them from the upstream file before building.
191 // TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
192 STATIC const uint8_t tok_enc_kind[] = {
197 
209 };
210 
211 // must have the same order as enum in lexer.h
212 // must be sorted according to strcmp
213 STATIC const char *const tok_kw[] = {
214  "False",
215  "None",
216  "True",
217  "__debug__",
218  "and",
219  "as",
220  "assert",
221  #if MICROPY_PY_ASYNC_AWAIT
222  "async",
223  "await",
224  #endif
225  "break",
226  "class",
227  "continue",
228  "def",
229  "del",
230  "elif",
231  "else",
232  "except",
233  "finally",
234  "for",
235  "from",
236  "global",
237  "if",
238  "import",
239  "in",
240  "is",
241  "lambda",
242  "nonlocal",
243  "not",
244  "or",
245  "pass",
246  "raise",
247  "return",
248  "try",
249  "while",
250  "with",
251  "yield",
252 };
253 
254 // This is called with CUR_CHAR() before first hex digit, and should return with
255 // it pointing to last hex digit
256 // num_digits must be greater than zero
257 STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
258  mp_uint_t num = 0;
259  while (num_digits-- != 0) {
260  next_char(lex);
261  unichar c = CUR_CHAR(lex);
262  if (!unichar_isxdigit(c)) {
263  return false;
264  }
265  num = (num << 4) + unichar_xdigit_value(c);
266  }
267  *result = num;
268  return true;
269 }
270 
// Parse one string/bytes literal and append its contents to lex->vstr.
//
// On entry the current character is the opening quote (any type-code prefix
// has already been skipped by the caller). Both single- and triple-quoted
// forms are handled. Closing quote characters are appended to the vstr while
// scanning and cut off again at the end, so only the literal's body remains.
//
//   is_raw  when true, a backslash is emitted verbatim together with the
//           character that follows it instead of being interpreted.
//
// Invalid escapes / out-of-range characters set lex->tok_kind to
// MP_TOKEN_INVALID; a "\N" escape raises NotImplementedError.
271 STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
272  // get first quoting character
273  char quote_char = '\'';
274  if (is_char(lex, '\"')) {
275  quote_char = '\"';
276  }
277  next_char(lex);
278 
279  // work out if it's a single or triple quoted literal
280  size_t num_quotes;
281  if (is_char_and(lex, quote_char, quote_char)) {
282  // triple quotes
283  next_char(lex);
284  next_char(lex);
285  num_quotes = 3;
286  } else {
287  // single quotes
288  num_quotes = 1;
289  }
290 
// n_closing counts consecutive quote characters seen; the loop stops at end
// of input, at a newline inside a single-quoted literal, or once num_quotes
// closing quotes in a row have been read.
291  size_t n_closing = 0;
292  while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
293  if (is_char(lex, quote_char)) {
294  n_closing += 1;
295  vstr_add_char(&lex->vstr, CUR_CHAR(lex));
296  } else {
297  n_closing = 0;
298  if (is_char(lex, '\\')) {
299  next_char(lex);
300  unichar c = CUR_CHAR(lex);
301  if (is_raw) {
302  // raw strings allow escaping of quotes, but the backslash is also emitted
303  vstr_add_char(&lex->vstr, '\\');
304  } else {
305  switch (c) {
306  // note: "c" can never be MP_LEXER_EOF because next_char
307  // always inserts a newline at the end of the input stream
308  case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
309  case '\\': break;
310  case '\'': break;
311  case '"': break;
312  case 'a': c = 0x07; break;
313  case 'b': c = 0x08; break;
314  case 't': c = 0x09; break;
315  case 'n': c = 0x0a; break;
316  case 'v': c = 0x0b; break;
317  case 'f': c = 0x0c; break;
318  case 'r': c = 0x0d; break;
319  case 'u':
320  case 'U':
321  if (lex->tok_kind == MP_TOKEN_BYTES) {
322  // b'\u1234' == b'\\u1234'
323  vstr_add_char(&lex->vstr, '\\');
324  break;
325  }
326  // Otherwise fall through.
327  case 'x':
328  {
// \x takes 2 hex digits, \u takes 4 and \U takes 8
329  mp_uint_t num = 0;
330  if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
331  // not enough hex chars for escape sequence
332  lex->tok_kind = MP_TOKEN_INVALID;
333  }
334  c = num;
335  break;
336  }
337  case 'N':
338  // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
339  // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
340  // 3MB of text; even gzip-compressed and with minimal structure, it'll take
341  // roughly half a meg of storage. This form of Unicode escape may be added
342  // later on, but it's definitely not a priority right now. -- CJA 20140607
343  mp_raise_NotImplementedError("unicode name escapes");
344  break;
345  default:
346  if (c >= '0' && c <= '7') {
347  // Octal sequence, 1-3 chars
348  size_t digits = 3;
349  mp_uint_t num = c - '0';
350  while (is_following_odigit(lex) && --digits != 0) {
351  next_char(lex);
352  num = num * 8 + (CUR_CHAR(lex) - '0');
353  }
354  c = num;
355  } else {
356  // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
357  vstr_add_char(&lex->vstr, '\\');
358  }
359  break;
360  }
361  }
// c == MP_LEXER_EOF here means an escaped newline, which contributes nothing
362  if (c != MP_LEXER_EOF) {
// NOTE(review): the embedded listing numbers jump from 362 to 364, and the
// braces below do not balance without an extra opening condition here (the
// "} else {" at 373 talks about the non-unicode case) -- a line appears to be
// missing from this extract; confirm against the upstream file.
364  if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
365  vstr_add_char(&lex->vstr, c);
366  } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
367  vstr_add_byte(&lex->vstr, c);
368  } else {
369  // unicode character out of range
370  // this raises a generic SyntaxError; could provide more info
371  lex->tok_kind = MP_TOKEN_INVALID;
372  }
373  } else {
374  // without unicode everything is just added as an 8-bit byte
375  if (c < 0x100) {
376  vstr_add_byte(&lex->vstr, c);
377  } else {
378  // 8-bit character out of range
379  // this raises a generic SyntaxError; could provide more info
380  lex->tok_kind = MP_TOKEN_INVALID;
381  }
382  }
383  }
384  } else {
385  // Add the "character" as a byte so that we remain 8-bit clean.
386  // This way, strings are parsed correctly whether or not they contain utf-8 chars.
387  vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
388  }
389  }
390  next_char(lex);
391  }
392 
393  // check we got the required end quotes
394  if (n_closing < num_quotes) {
// NOTE(review): the embedded listing numbers jump from 394 to 396, so the
// statement that reports an unterminated literal appears to be missing from
// this extract; confirm against the upstream file.
396  }
397 
398  // cut off the end quotes from the token text
399  vstr_cut_tail_bytes(&lex->vstr, n_closing);
400 }
401 
402 STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
403  bool had_physical_newline = false;
404  while (!is_end(lex)) {
405  if (is_physical_newline(lex)) {
406  if (stop_at_newline && lex->nested_bracket_level == 0) {
407  break;
408  }
409  had_physical_newline = true;
410  next_char(lex);
411  } else if (is_whitespace(lex)) {
412  next_char(lex);
413  } else if (is_char(lex, '#')) {
414  next_char(lex);
415  while (!is_end(lex) && !is_physical_newline(lex)) {
416  next_char(lex);
417  }
418  // had_physical_newline will be set on next loop
419  } else if (is_char_and(lex, '\\', '\n')) {
420  // line-continuation, so don't set had_physical_newline
421  next_char(lex);
422  next_char(lex);
423  } else {
424  break;
425  }
426  }
427  return had_physical_newline;
428 }
429 
// Advance the lexer to the next token.
//
// Resets lex->vstr, skips whitespace and comments, records the token position
// in lex->tok_line / lex->tok_column and then sets lex->tok_kind. For names,
// numbers and string/bytes literals the token text is accumulated in
// lex->vstr.
//
// Tokens are chosen in this priority order: pending DEDENT/INDENT tokens
// (lex->emit_dent), NEWLINE (with indentation bookkeeping), END, string/bytes
// literal, name/keyword, number, and finally delimiters/operators decoded
// from tok_enc.
430 void mp_lexer_to_next(mp_lexer_t *lex) {
431  // start new token text
432  vstr_reset(&lex->vstr);
433 
434  // skip white space and comments
435  bool had_physical_newline = skip_whitespace(lex, false);
436 
437  // set token source information
438  lex->tok_line = lex->line;
439  lex->tok_column = lex->column;
440 
// emit_dent < 0: that many DEDENT tokens are still owed, one per call;
// emit_dent > 0: likewise for INDENT tokens
441  if (lex->emit_dent < 0) {
442  lex->tok_kind = MP_TOKEN_DEDENT;
443  lex->emit_dent += 1;
444 
445  } else if (lex->emit_dent > 0) {
446  lex->tok_kind = MP_TOKEN_INDENT;
447  lex->emit_dent -= 1;
448 
449  } else if (had_physical_newline && lex->nested_bracket_level == 0) {
450  lex->tok_kind = MP_TOKEN_NEWLINE;
451 
// compare the new line's indentation against the indent stack and schedule
// INDENT/DEDENT tokens for the following calls
452  size_t num_spaces = lex->column - 1;
453  if (num_spaces == indent_top(lex)) {
454  } else if (num_spaces > indent_top(lex)) {
455  indent_push(lex, num_spaces);
456  lex->emit_dent += 1;
457  } else {
458  while (num_spaces < indent_top(lex)) {
459  indent_pop(lex);
460  lex->emit_dent -= 1;
461  }
462  if (num_spaces != indent_top(lex)) {
// NOTE(review): the embedded listing numbers jump from 462 to 464, so the
// statement handling an indentation that matches no enclosing level appears
// to be missing from this extract; confirm against the upstream file.
464  }
465  }
466 
467  } else if (is_end(lex)) {
468  lex->tok_kind = MP_TOKEN_END;
469 
470  } else if (is_string_or_bytes(lex)) {
471  // a string or bytes literal
472 
473  // Python requires adjacent string/bytes literals to be automatically
474  // concatenated. We do it here in the tokeniser to make efficient use of RAM,
475  // because then the lexer's vstr can be used to accumulate the string literal,
476  // in contrast to creating a parse tree of strings and then joining them later
477  // in the compiler. It's also more compact in code size to do it here.
478 
479  // MP_TOKEN_END is used to indicate that this is the first string token
480  lex->tok_kind = MP_TOKEN_END;
481 
482  // Loop to accumulate string/bytes literals
483  do {
484  // parse type codes
485  bool is_raw = false;
// NOTE(review): the embedded listing numbers jump from 485 to 487 and `kind`
// is used below without a visible declaration -- its declaration (with a
// default token kind) appears to be missing from this extract.
487  int n_char = 0;
488  if (is_char(lex, 'u')) {
489  n_char = 1;
490  } else if (is_char(lex, 'b')) {
491  kind = MP_TOKEN_BYTES;
492  n_char = 1;
493  if (is_char_following(lex, 'r')) {
494  is_raw = true;
495  n_char = 2;
496  }
497  } else if (is_char(lex, 'r')) {
498  is_raw = true;
499  n_char = 1;
500  if (is_char_following(lex, 'b')) {
501  kind = MP_TOKEN_BYTES;
502  n_char = 2;
503  }
504  }
505 
506  // Set or check token kind
507  if (lex->tok_kind == MP_TOKEN_END) {
508  lex->tok_kind = kind;
509  } else if (lex->tok_kind != kind) {
510  // Can't concatenate string with bytes
511  break;
512  }
513 
514  // Skip any type code characters
515  if (n_char != 0) {
516  next_char(lex);
517  if (n_char == 2) {
518  next_char(lex);
519  }
520  }
521 
522  // Parse the literal
523  parse_string_literal(lex, is_raw);
524 
525  // Skip whitespace so we can check if there's another string following
526  skip_whitespace(lex, true);
527 
528  } while (is_string_or_bytes(lex));
529 
530  } else if (is_head_of_identifier(lex)) {
531  lex->tok_kind = MP_TOKEN_NAME;
532 
533  // get first char (add as byte to remain 8-bit clean and support utf-8)
534  vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
535  next_char(lex);
536 
537  // get tail chars
538  while (!is_end(lex) && is_tail_of_identifier(lex)) {
539  vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
540  next_char(lex);
541  }
542 
543  // Check if the name is a keyword.
544  // We also check for __debug__ here and convert it to its value. This is
545  // so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
546  // need to check for this special token in many places in the compiler.
547  const char *s = vstr_null_terminated_str(&lex->vstr);
548  for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
549  int cmp = strcmp(s, tok_kw[i]);
550  if (cmp == 0) {
// keyword token kinds are laid out in the same order as tok_kw
551  lex->tok_kind = MP_TOKEN_KW_FALSE + i;
552  if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
553  lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
554  }
555  break;
556  } else if (cmp < 0) {
557  // Table is sorted and comparison was less-than, so stop searching
558  break;
559  }
560  }
561 
562  } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
// forced_integer is set for a 0b/0o/0x prefix so that an 'e' inside the
// literal is not treated as an exponent marker
563  bool forced_integer = false;
564  if (is_char(lex, '.')) {
// NOTE(review): the embedded listing numbers jump from 564 to 566, so the
// token-kind assignment for a literal starting with '.' appears to be missing
// from this extract.
566  } else {
567  lex->tok_kind = MP_TOKEN_INTEGER;
568  if (is_char(lex, '0') && is_following_base_char(lex)) {
569  forced_integer = true;
570  }
571  }
572 
573  // get first char
574  vstr_add_char(&lex->vstr, CUR_CHAR(lex));
575  next_char(lex);
576 
577  // get tail chars
578  while (!is_end(lex)) {
579  if (!forced_integer && is_char_or(lex, 'e', 'E')) {
// NOTE(review): listing line 580 is absent from this extract.
// The exponent marker is always stored as lower-case 'e', followed by an
// optional sign.
581  vstr_add_char(&lex->vstr, 'e');
582  next_char(lex);
583  if (is_char(lex, '+') || is_char(lex, '-')) {
584  vstr_add_char(&lex->vstr, CUR_CHAR(lex));
585  next_char(lex);
586  }
587  } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
588  if (is_char_or3(lex, '.', 'j', 'J')) {
// NOTE(review): listing line 589 is absent from this extract, leaving this
// branch empty.
590  }
591  vstr_add_char(&lex->vstr, CUR_CHAR(lex));
592  next_char(lex);
593  } else {
594  break;
595  }
596  }
597 
598  } else {
599  // search for encoded delimiter or operator
600 
// walk tok_enc until the current character matches a token-start character;
// tok_enc_index counts the tokens passed over (an 'e'/'c' marker and the
// character after it together count as one token)
601  const char *t = tok_enc;
602  size_t tok_enc_index = 0;
603  for (; *t != 0 && !is_char(lex, *t); t += 1) {
604  if (*t == 'e' || *t == 'c') {
605  t += 1;
606  }
607  tok_enc_index += 1;
608  }
609 
610  next_char(lex);
611 
612  if (*t == 0) {
613  // didn't match any delimiter or operator characters
614  lex->tok_kind = MP_TOKEN_INVALID;
615 
616  } else if (*t == '!') {
617  // "!=" is a special case because "!" is not a valid operator
618  if (is_char(lex, '=')) {
619  next_char(lex);
// NOTE(review): listing line 620 (the token-kind assignment for "!=") is
// absent from this extract.
621  } else {
622  lex->tok_kind = MP_TOKEN_INVALID;
623  }
624 
625  } else if (*t == '.') {
626  // "." and "..." are special cases because ".." is not a valid operator
627  if (is_char_and(lex, '.', '.')) {
628  next_char(lex);
629  next_char(lex);
// NOTE(review): listing lines 630 and 632 (the token-kind assignments for
// "..." and ".") are absent from this extract.
631  } else {
633  }
634 
635  } else {
636  // matched a delimiter or operator character
637 
638  // get the maximum characters for a valid token
639  t += 1;
640  size_t t_index = tok_enc_index;
641  while (*t == 'c' || *t == 'e') {
642  t_index += 1;
643  if (is_char(lex, t[1])) {
644  next_char(lex);
645  tok_enc_index = t_index;
646  if (*t == 'e') {
647  break;
648  }
649  } else if (*t == 'c') {
650  break;
651  }
652  t += 2;
653  }
654 
655  // set token kind
656  lex->tok_kind = tok_enc_kind[tok_enc_index];
657 
658  // compute bracket level for implicit line joining
// NOTE(review): listing lines 659 and 661 are absent from this extract; they
// presumably hold the conditions selecting the increment for an opening
// bracket and the decrement for a closing one -- as shown the two statements
// below are not guarded and the braces do not balance.
660  lex->nested_bracket_level += 1;
662  lex->nested_bracket_level -= 1;
663  }
664  }
665  }
666 }
667 
668 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
670 
671  lex->source_name = src_name;
672  lex->reader = reader;
673  lex->line = 1;
674  lex->column = (size_t)-2; // account for 3 dummy bytes
675  lex->emit_dent = 0;
676  lex->nested_bracket_level = 0;
678  lex->num_indent_level = 1;
680  vstr_init(&lex->vstr, 32);
681 
682  // store sentinel for first indentation level
683  lex->indent_level[0] = 0;
684 
685  // load lexer with start of file, advancing lex->column to 1
686  // start with dummy bytes and use next_char() for proper EOL/EOF handling
687  lex->chr0 = lex->chr1 = lex->chr2 = 0;
688  next_char(lex);
689  next_char(lex);
690  next_char(lex);
691 
692  // preload first token
693  mp_lexer_to_next(lex);
694 
695  // Check that the first token is in the first column. If it's not then we
696  // convert the token kind to INDENT so that the parser gives a syntax error.
697  if (lex->tok_column != 1) {
698  lex->tok_kind = MP_TOKEN_INDENT;
699  }
700 
701  return lex;
702 }
703 
704 mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
705  mp_reader_t reader;
706  mp_reader_new_mem(&reader, (const byte*)str, len, free_len);
707  return mp_lexer_new(src_name, reader);
708 }
709 
710 #if MICROPY_READER_POSIX || MICROPY_READER_VFS
711 
712 mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
713  mp_reader_t reader;
714  mp_reader_new_file(&reader, filename);
715  return mp_lexer_new(qstr_from_str(filename), reader);
716 }
717 
718 #if MICROPY_HELPER_LEXER_UNIX
719 
720 mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
721  mp_reader_t reader;
722  mp_reader_new_file_from_fd(&reader, fd, close_fd);
723  return mp_lexer_new(filename, reader);
724 }
725 
726 #endif
727 
728 #endif
729 
730 void mp_lexer_free(mp_lexer_t *lex) {
731  if (lex) {
732  lex->reader.close(lex->reader.data);
733  vstr_clear(&lex->vstr);
735  m_del_obj(mp_lexer_t, lex);
736  }
737 }
738 
739 #if 0
740 // This function is used to print the current token and should only be
741 // needed to debug the lexer, so it's not available via a config option.
742 void mp_lexer_show_token(const mp_lexer_t *lex) {
743  printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
744  if (lex->vstr.len > 0) {
745  const byte *i = (const byte *)lex->vstr.buf;
746  const byte *j = (const byte *)i + lex->vstr.len;
747  printf(" ");
748  while (i < j) {
749  unichar c = utf8_get_char(i);
750  i = utf8_next_char(i);
751  if (unichar_isprint(c)) {
752  printf("%c", (int)c);
753  } else {
754  printf("?");
755  }
756  }
757  }
758  printf("\n");
759 }
760 #endif
761 
762 #endif // MICROPY_ENABLE_COMPILER
size_t alloc_indent_level
Definition: lexer.h:160
unichar chr0
Definition: lexer.h:152
uintptr_t mp_uint_t
Definition: mpconfigport.h:74
void mp_lexer_to_next(mp_lexer_t *lex)
void mp_lexer_free(mp_lexer_t *lex)
NORETURN void mp_raise_NotImplementedError(const char *msg)
Definition: runtime.c:1468
bool unichar_isalpha(unichar c)
Definition: unicode.c:132
void mp_reader_new_file_from_fd(mp_reader_t *reader, int fd, bool close_fd)
#define m_del(type, ptr, num)
Definition: misc.h:77
void vstr_add_char(vstr_t *vstr, unichar chr)
Definition: vstr.c:146
char * buf
Definition: misc.h:145
vstr_t vstr
Definition: lexer.h:167
size_t tok_column
Definition: lexer.h:165
unsigned int size_t
Definition: stddef.h:7
#define MICROPY_ALLOC_LEXER_INDENT_INIT
Definition: mpconfig.h:135
mp_lexer_t * mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len)
void vstr_init(vstr_t *vstr, size_t alloc)
Definition: vstr.c:40
bool unichar_isxdigit(unichar c)
Definition: unicode.c:146
unsigned short uint16_t
Definition: stdint.h:5
#define MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC
Definition: mpconfig.h:325
void mp_reader_new_file(mp_reader_t *reader, const char *filename)
void vstr_add_byte(vstr_t *vstr, byte v)
Definition: vstr.c:141
mp_lexer_t * mp_lexer_new(qstr src_name, mp_reader_t reader)
qstr qstr_from_str(const char *str)
Definition: qstr.c:183
#define MP_ARRAY_SIZE(a)
Definition: misc.h:106
const byte * utf8_next_char(const byte *s)
Definition: unicode.c:89
size_t len
Definition: misc.h:144
#define MP_STATE_VM(x)
Definition: mpstate.h:241
unsigned char uint8_t
Definition: stdint.h:4
mp_token_kind_t tok_kind
Definition: lexer.h:166
bool unichar_isdigit(unichar c)
Definition: unicode.c:142
#define STATIC
Definition: mpconfig.h:1178
enum _mp_token_kind_t mp_token_kind_t
mp_lexer_t * mp_lexer_new_from_file(const char *filename)
Definition: port.cpp:146
uint16_t * indent_level
Definition: lexer.h:162
c(generic_all_nodes)
size_t tok_line
Definition: lexer.h:164
#define m_del_obj(type, ptr)
Definition: misc.h:80
bool unichar_isspace(unichar c)
Definition: unicode.c:128
mp_uint_t unichar_xdigit_value(unichar c)
Definition: unicode.c:176
size_t qstr
Definition: qstr.h:48
#define UINT_FMT
Definition: mpconfigport.h:71
size_t line
Definition: lexer.h:154
unichar chr1
Definition: lexer.h:152
mp_reader_t reader
Definition: lexer.h:150
unsigned char byte
Definition: misc.h:37
#define m_renew(type, ptr, old_num, new_num)
Definition: misc.h:75
unichar utf8_get_char(const byte *s)
Definition: unicode.c:71
mp_int_t nested_bracket_level
Definition: lexer.h:158
char * vstr_null_terminated_str(vstr_t *vstr)
Definition: vstr.c:132
bool unichar_isprint(unichar c)
size_t num_indent_level
Definition: lexer.h:161
unichar chr2
Definition: lexer.h:152
void(* close)(void *data)
Definition: reader.h:39
void * data
Definition: reader.h:37
#define MICROPY_ALLOC_LEXEL_INDENT_INC
Definition: mpconfig.h:140
int strcmp(const char *s1, const char *s2)
Definition: strcmp.c:3
size_t column
Definition: lexer.h:155
mp_int_t emit_dent
Definition: lexer.h:157
mp_uint_t(* readbyte)(void *data)
Definition: reader.h:38
void mp_reader_new_mem(mp_reader_t *reader, const byte *buf, size_t len, size_t free_len)
Definition: reader.c:58
qstr source_name
Definition: lexer.h:149
#define m_new_obj(type)
Definition: misc.h:60
uint unichar
Definition: misc.h:119
void vstr_clear(vstr_t *vstr)
Definition: vstr.c:70
void vstr_cut_tail_bytes(vstr_t *vstr, size_t bytes_to_cut)
Definition: vstr.c:216
#define m_new(type, num)
Definition: misc.h:57