Numworks Epsilon  1.4.1
Graphing Calculator Operating System
unicode.c
Go to the documentation of this file.
1 /*
2  * This file is part of the MicroPython project, http://micropython.org/
3  *
4  * The MIT License (MIT)
5  *
6  * Copyright (c) 2013, 2014 Damien P. George
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 
27 #include <stdint.h>
28 
29 #include "py/unicode.h"
30 
31 // attribute flags: one bit per character class, OR-ed together in the attr table
32 #define FL_PRINT (0x01) // printable character
33 #define FL_SPACE (0x02) // whitespace (tested by unichar_isspace)
34 #define FL_DIGIT (0x04) // decimal digit (tested by unichar_isdigit)
35 #define FL_ALPHA (0x08) // letter (tested by unichar_isalpha)
36 #define FL_UPPER (0x10) // upper-case letter (tested by unichar_isupper)
37 #define FL_LOWER (0x20) // lower-case letter (tested by unichar_islower)
38 #define FL_XDIGIT (0x40) // hexadecimal digit (tested by unichar_isxdigit)
39
40 // shorthand character attributes: the flag combinations used as attr table entries
41 #define AT_PR (FL_PRINT) // printable only
42 #define AT_SP (FL_SPACE | FL_PRINT) // whitespace
43 #define AT_DI (FL_DIGIT | FL_PRINT | FL_XDIGIT) // decimal digit (also a hex digit)
44 #define AT_AL (FL_ALPHA | FL_PRINT) // letter with no case
45 #define AT_UP (FL_UPPER | FL_ALPHA | FL_PRINT) // upper-case letter
46 #define AT_LO (FL_LOWER | FL_ALPHA | FL_PRINT) // lower-case letter
47 #define AT_UX (FL_UPPER | FL_ALPHA | FL_PRINT | FL_XDIGIT) // upper-case letter that is a hex digit
48 #define AT_LX (FL_LOWER | FL_ALPHA | FL_PRINT | FL_XDIGIT) // lower-case letter that is a hex digit
49 
50 // table of attributes for ascii characters
51 STATIC const uint8_t attr[] = {
52  0, 0, 0, 0, 0, 0, 0, 0,
53  0, AT_SP, AT_SP, AT_SP, AT_SP, AT_SP, 0, 0,
54  0, 0, 0, 0, 0, 0, 0, 0,
55  0, 0, 0, 0, 0, 0, 0, 0,
68 };
69 
70 // TODO: Rename to str_get_char
72 #if MICROPY_PY_BUILTINS_STR_UNICODE
73  unichar ord = *s++;
74  if (!UTF8_IS_NONASCII(ord)) return ord;
75  ord &= 0x7F;
76  for (unichar mask = 0x40; ord & mask; mask >>= 1) {
77  ord &= ~mask;
78  }
79  while (UTF8_IS_CONT(*s)) {
80  ord = (ord << 6) | (*s++ & 0x3F);
81  }
82  return ord;
83 #else
84  return *s;
85 #endif
86 }
87 
88 // TODO: Rename to str_next_char
89 const byte *utf8_next_char(const byte *s) {
90 #if MICROPY_PY_BUILTINS_STR_UNICODE
91  ++s;
92  while (UTF8_IS_CONT(*s)) {
93  ++s;
94  }
95  return s;
96 #else
97  return s + 1;
98 #endif
99 }
100 
101 mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
102  mp_uint_t i = 0;
103  while (ptr > s) {
104  if (!UTF8_IS_CONT(*--ptr)) {
105  i++;
106  }
107  }
108 
109  return i;
110 }
111 
112 // TODO: Rename to str_charlen
113 mp_uint_t unichar_charlen(const char *str, mp_uint_t len) {
114 #if MICROPY_PY_BUILTINS_STR_UNICODE
115  mp_uint_t charlen = 0;
116  for (const char *top = str + len; str < top; ++str) {
117  if (!UTF8_IS_CONT(*str)) {
118  ++charlen;
119  }
120  }
121  return charlen;
122 #else
123  return len;
124 #endif
125 }
126 
127 // Be aware: These unichar_is* functions are actually ASCII-only!
129  return c < 128 && (attr[c] & FL_SPACE) != 0;
130 }
131 
133  return c < 128 && (attr[c] & FL_ALPHA) != 0;
134 }
135 
136 /* unused
137 bool unichar_isprint(unichar c) {
138  return c < 128 && (attr[c] & FL_PRINT) != 0;
139 }
140 */
141 
143  return c < 128 && (attr[c] & FL_DIGIT) != 0;
144 }
145 
147  return c < 128 && (attr[c] & FL_XDIGIT) != 0;
148 }
149 
151  return c < 128 && ((attr[c] & (FL_ALPHA | FL_DIGIT)) != 0 || c == '_');
152 }
153 
155  return c < 128 && (attr[c] & FL_UPPER) != 0;
156 }
157 
159  return c < 128 && (attr[c] & FL_LOWER) != 0;
160 }
161 
163  if (unichar_isupper(c)) {
164  return c + 0x20;
165  }
166  return c;
167 }
168 
170  if (unichar_islower(c)) {
171  return c - 0x20;
172  }
173  return c;
174 }
175 
177  // c is assumed to be hex digit
178  mp_uint_t n = c - '0';
179  if (n > 9) {
180  n &= ~('a' - 'A');
181  n -= ('A' - ('9' + 1));
182  }
183  return n;
184 }
185 
186 bool utf8_check(const byte *p, size_t len) {
187  uint8_t need = 0;
188  const byte *end = p + len;
189  for (; p < end; p++) {
190  byte c = *p;
191  if (need) {
192  if (c >= 0x80) {
193  need--;
194  } else {
195  // mismatch
196  return 0;
197  }
198  } else {
199  if (c >= 0xc0) {
200  if (c >= 0xf8) {
201  // mismatch
202  return 0;
203  }
204  need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
205  } else if (c >= 0x80) {
206  // mismatch
207  return 0;
208  }
209  }
210  }
211  return need == 0; // no pending fragments allowed
212 }
unichar unichar_toupper(unichar c)
Definition: unicode.c:169
uintptr_t mp_uint_t
Definition: mpconfigport.h:74
#define AT_DI
Definition: unicode.c:43
#define AT_SP
Definition: unicode.c:42
bool unichar_isdigit(unichar c)
Definition: unicode.c:142
STATIC const uint8_t attr[]
Definition: unicode.c:51
bool unichar_islower(unichar c)
Definition: unicode.c:158
#define FL_LOWER
Definition: unicode.c:37
unichar unichar_tolower(unichar c)
Definition: unicode.c:162
bool unichar_isxdigit(unichar c)
Definition: unicode.c:146
unsigned char uint8_t
Definition: stdint.h:4
mp_uint_t unichar_xdigit_value(unichar c)
Definition: unicode.c:176
#define AT_UX
Definition: unicode.c:47
mp_uint_t unichar_charlen(const char *str, mp_uint_t len)
Definition: unicode.c:113
#define STATIC
Definition: mpconfig.h:1178
bool utf8_check(const byte *p, size_t len)
Definition: unicode.c:186
#define UTF8_IS_CONT(ch)
Definition: misc.h:138
#define FL_SPACE
Definition: unicode.c:33
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr)
Definition: unicode.c:101
c(generic_all_nodes)
#define FL_ALPHA
Definition: unicode.c:35
const byte * utf8_next_char(const byte *s)
Definition: unicode.c:89
#define AT_LX
Definition: unicode.c:48
#define AT_UP
Definition: unicode.c:45
bool unichar_isident(unichar c)
Definition: unicode.c:150
unsigned char byte
Definition: misc.h:37
#define UTF8_IS_NONASCII(ch)
Definition: misc.h:137
bool unichar_isspace(unichar c)
Definition: unicode.c:128
#define AT_LO
Definition: unicode.c:46
#define FL_DIGIT
Definition: unicode.c:34
bool unichar_isupper(unichar c)
Definition: unicode.c:154
unichar utf8_get_char(const byte *s)
Definition: unicode.c:71
bool unichar_isalpha(unichar c)
Definition: unicode.c:132
#define FL_UPPER
Definition: unicode.c:36
uint unichar
Definition: misc.h:119
#define AT_PR
Definition: unicode.c:41
#define FL_XDIGIT
Definition: unicode.c:38