Numworks Epsilon  1.4.1
Graphing Calculator Operating System
objstrunicode.c
Go to the documentation of this file.
1 /*
2  * This file is part of the MicroPython project, http://micropython.org/
3  *
4  * The MIT License (MIT)
5  *
6  * Copyright (c) 2013, 2014 Damien P. George
7  * Copyright (c) 2014 Paul Sokolovsky
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a copy
10  * of this software and associated documentation files (the "Software"), to deal
11  * in the Software without restriction, including without limitation the rights
12  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13  * copies of the Software, and to permit persons to whom the Software is
14  * furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included in
17  * all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25  * THE SOFTWARE.
26  */
27 
28 #include <string.h>
29 #include <assert.h>
30 
31 #include "py/objstr.h"
32 #include "py/objlist.h"
33 #include "py/runtime.h"
34 
35 #if MICROPY_PY_BUILTINS_STR_UNICODE
36 
38 
39 /******************************************************************************/
40 /* str */
41 
42 STATIC void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint str_len) {
43  // this escapes characters, but it will be very slow to print (calling print many times)
44  bool has_single_quote = false;
45  bool has_double_quote = false;
46  for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) {
47  if (*s == '\'') {
48  has_single_quote = true;
49  } else if (*s == '"') {
50  has_double_quote = true;
51  }
52  }
53  unichar quote_char = '\'';
54  if (has_single_quote && !has_double_quote) {
55  quote_char = '"';
56  }
57  mp_printf(print, "%c", quote_char);
58  const byte *s = str_data, *top = str_data + str_len;
59  while (s < top) {
60  unichar ch;
61  ch = utf8_get_char(s);
62  s = utf8_next_char(s);
63  if (ch == quote_char) {
64  mp_printf(print, "\\%c", quote_char);
65  } else if (ch == '\\') {
66  mp_print_str(print, "\\\\");
67  } else if (32 <= ch && ch <= 126) {
68  mp_printf(print, "%c", ch);
69  } else if (ch == '\n') {
70  mp_print_str(print, "\\n");
71  } else if (ch == '\r') {
72  mp_print_str(print, "\\r");
73  } else if (ch == '\t') {
74  mp_print_str(print, "\\t");
75  } else if (ch < 0x100) {
76  mp_printf(print, "\\x%02x", ch);
77  } else if (ch < 0x10000) {
78  mp_printf(print, "\\u%04x", ch);
79  } else {
80  mp_printf(print, "\\U%08x", ch);
81  }
82  }
83  mp_printf(print, "%c", quote_char);
84 }
85 
86 STATIC void uni_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
87  GET_STR_DATA_LEN(self_in, str_data, str_len);
88  #if MICROPY_PY_UJSON
89  if (kind == PRINT_JSON) {
90  mp_str_print_json(print, str_data, str_len);
91  return;
92  }
93  #endif
94  if (kind == PRINT_STR) {
95  mp_printf(print, "%.*s", str_len, str_data);
96  } else {
97  uni_print_quoted(print, str_data, str_len);
98  }
99 }
100 
101 STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
102  GET_STR_DATA_LEN(self_in, str_data, str_len);
103  switch (op) {
104  case MP_UNARY_OP_BOOL:
105  return mp_obj_new_bool(str_len != 0);
106  case MP_UNARY_OP_LEN:
107  return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char *)str_data, str_len));
108  default:
109  return MP_OBJ_NULL; // op not supported
110  }
111 }
112 
113 // Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or
114 // be capped to the first/last character of the string, depending on is_slice.
115 const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
116  mp_obj_t index, bool is_slice) {
117  // All str functions also handle bytes objects, and they call str_index_to_ptr(),
118  // so it must handle bytes.
119  if (type == &mp_type_bytes) {
120  // Taken from objstr.c:str_index_to_ptr()
121  size_t index_val = mp_get_index(type, self_len, index, is_slice);
122  return self_data + index_val;
123  }
124 
125  mp_int_t i;
126  // Copied from mp_get_index; I don't want bounds checking, just give me
127  // the integer as-is. (I can't bounds-check without scanning the whole
128  // string; an out-of-bounds index will be caught in the loops below.)
129  if (MP_OBJ_IS_SMALL_INT(index)) {
130  i = MP_OBJ_SMALL_INT_VALUE(index);
131  } else if (!mp_obj_get_int_maybe(index, &i)) {
132  nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index)));
133  }
134  const byte *s, *top = self_data + self_len;
135  if (i < 0)
136  {
137  // Negative indexing is performed by counting from the end of the string.
138  for (s = top - 1; i; --s) {
139  if (s < self_data) {
140  if (is_slice) {
141  return self_data;
142  }
143  mp_raise_msg(&mp_type_IndexError, "string index out of range");
144  }
145  if (!UTF8_IS_CONT(*s)) {
146  ++i;
147  }
148  }
149  ++s;
150  } else {
151  // Positive indexing, correspondingly, counts from the start of the string.
152  // It's assumed that negative indexing will generally be used with small
153  // absolute values (eg str[-1], not str[-1000000]), which means it'll be
154  // more efficient this way.
155  s = self_data;
156  while (1) {
157  // First check out-of-bounds
158  if (s >= top) {
159  if (is_slice) {
160  return top;
161  }
162  mp_raise_msg(&mp_type_IndexError, "string index out of range");
163  }
164  // Then check completion
165  if (i-- == 0) {
166  break;
167  }
168  // Then skip UTF-8 char
169  ++s;
170  while (UTF8_IS_CONT(*s)) {
171  ++s;
172  }
173  }
174  }
175  return s;
176 }
177 
178 STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
179  mp_obj_type_t *type = mp_obj_get_type(self_in);
180  assert(type == &mp_type_str);
181  GET_STR_DATA_LEN(self_in, self_data, self_len);
182  if (value == MP_OBJ_SENTINEL) {
183  // load
184 #if MICROPY_PY_BUILTINS_SLICE
185  if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
186  mp_obj_t ostart, ostop, ostep;
187  mp_obj_slice_get(index, &ostart, &ostop, &ostep);
188  if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) {
189  mp_raise_NotImplementedError("only slices with step=1 (aka None) are supported");
190  }
191 
192  const byte *pstart, *pstop;
193  if (ostart != mp_const_none) {
194  pstart = str_index_to_ptr(type, self_data, self_len, ostart, true);
195  } else {
196  pstart = self_data;
197  }
198  if (ostop != mp_const_none) {
199  // pstop will point just after the stop character. This depends on
200  // the \0 at the end of the string.
201  pstop = str_index_to_ptr(type, self_data, self_len, ostop, true);
202  } else {
203  pstop = self_data + self_len;
204  }
205  if (pstop < pstart) {
206  return MP_OBJ_NEW_QSTR(MP_QSTR_);
207  }
208  return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart);
209  }
210 #endif
211  const byte *s = str_index_to_ptr(type, self_data, self_len, index, false);
212  int len = 1;
213  if (UTF8_IS_NONASCII(*s)) {
214  // Count the number of 1 bits (after the first)
215  for (char mask = 0x40; *s & mask; mask >>= 1) {
216  ++len;
217  }
218  }
219  return mp_obj_new_str((const char*)s, len, true); // This will create a one-character string
220  } else {
221  return MP_OBJ_NULL; // op not supported
222  }
223 }
224 
225 STATIC const mp_rom_map_elem_t struni_locals_dict_table[] = {
226 #if MICROPY_CPYTHON_COMPAT
227  { MP_ROM_QSTR(MP_QSTR_encode), MP_ROM_PTR(&str_encode_obj) },
228 #endif
229  { MP_ROM_QSTR(MP_QSTR_find), MP_ROM_PTR(&str_find_obj) },
230  { MP_ROM_QSTR(MP_QSTR_rfind), MP_ROM_PTR(&str_rfind_obj) },
231  { MP_ROM_QSTR(MP_QSTR_index), MP_ROM_PTR(&str_index_obj) },
232  { MP_ROM_QSTR(MP_QSTR_rindex), MP_ROM_PTR(&str_rindex_obj) },
233  { MP_ROM_QSTR(MP_QSTR_join), MP_ROM_PTR(&str_join_obj) },
234  { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&str_split_obj) },
235  #if MICROPY_PY_BUILTINS_STR_SPLITLINES
236  { MP_ROM_QSTR(MP_QSTR_splitlines), MP_ROM_PTR(&str_splitlines_obj) },
237  #endif
238  { MP_ROM_QSTR(MP_QSTR_rsplit), MP_ROM_PTR(&str_rsplit_obj) },
239  { MP_ROM_QSTR(MP_QSTR_startswith), MP_ROM_PTR(&str_startswith_obj) },
240  { MP_ROM_QSTR(MP_QSTR_endswith), MP_ROM_PTR(&str_endswith_obj) },
241  { MP_ROM_QSTR(MP_QSTR_strip), MP_ROM_PTR(&str_strip_obj) },
242  { MP_ROM_QSTR(MP_QSTR_lstrip), MP_ROM_PTR(&str_lstrip_obj) },
243  { MP_ROM_QSTR(MP_QSTR_rstrip), MP_ROM_PTR(&str_rstrip_obj) },
244  { MP_ROM_QSTR(MP_QSTR_format), MP_ROM_PTR(&str_format_obj) },
245  { MP_ROM_QSTR(MP_QSTR_replace), MP_ROM_PTR(&str_replace_obj) },
246  { MP_ROM_QSTR(MP_QSTR_count), MP_ROM_PTR(&str_count_obj) },
247  #if MICROPY_PY_BUILTINS_STR_PARTITION
248  { MP_ROM_QSTR(MP_QSTR_partition), MP_ROM_PTR(&str_partition_obj) },
249  { MP_ROM_QSTR(MP_QSTR_rpartition), MP_ROM_PTR(&str_rpartition_obj) },
250  #endif
251  #if MICROPY_PY_BUILTINS_STR_CENTER
252  { MP_ROM_QSTR(MP_QSTR_center), MP_ROM_PTR(&str_center_obj) },
253  #endif
254  { MP_ROM_QSTR(MP_QSTR_lower), MP_ROM_PTR(&str_lower_obj) },
255  { MP_ROM_QSTR(MP_QSTR_upper), MP_ROM_PTR(&str_upper_obj) },
256  { MP_ROM_QSTR(MP_QSTR_isspace), MP_ROM_PTR(&str_isspace_obj) },
257  { MP_ROM_QSTR(MP_QSTR_isalpha), MP_ROM_PTR(&str_isalpha_obj) },
258  { MP_ROM_QSTR(MP_QSTR_isdigit), MP_ROM_PTR(&str_isdigit_obj) },
259  { MP_ROM_QSTR(MP_QSTR_isupper), MP_ROM_PTR(&str_isupper_obj) },
260  { MP_ROM_QSTR(MP_QSTR_islower), MP_ROM_PTR(&str_islower_obj) },
261 };
262 
263 STATIC MP_DEFINE_CONST_DICT(struni_locals_dict, struni_locals_dict_table);
264 
265 const mp_obj_type_t mp_type_str = {
266  { &mp_type_type },
267  .name = MP_QSTR_str,
268  .print = uni_print,
269  .make_new = mp_obj_str_make_new,
270  .unary_op = uni_unary_op,
271  .binary_op = mp_obj_str_binary_op,
272  .subscr = str_subscr,
273  .getiter = mp_obj_new_str_iterator,
274  .buffer_p = { .get_buffer = mp_obj_str_get_buffer },
275  .locals_dict = (mp_obj_dict_t*)&struni_locals_dict,
276 };
277 
278 /******************************************************************************/
279 /* str iterator */
280 
281 typedef struct _mp_obj_str_it_t {
282  mp_obj_base_t base;
283  mp_fun_1_t iternext;
284  mp_obj_t str;
285  size_t cur;
286 } mp_obj_str_it_t;
287 
289  mp_obj_str_it_t *self = MP_OBJ_TO_PTR(self_in);
290  GET_STR_DATA_LEN(self->str, str, len);
291  if (self->cur < len) {
292  const byte *cur = str + self->cur;
293  const byte *end = utf8_next_char(str + self->cur);
294  mp_obj_t o_out = mp_obj_new_str((const char*)cur, end - cur, true);
295  self->cur += end - cur;
296  return o_out;
297  } else {
298  return MP_OBJ_STOP_ITERATION;
299  }
300 }
301 
303  assert(sizeof(mp_obj_str_it_t) <= sizeof(mp_obj_iter_buf_t));
304  mp_obj_str_it_t *o = (mp_obj_str_it_t*)iter_buf;
305  o->base.type = &mp_type_polymorph_iter;
307  o->str = str;
308  o->cur = 0;
309  return MP_OBJ_FROM_PTR(o);
310 }
311 
312 #endif // MICROPY_PY_BUILTINS_STR_UNICODE
intptr_t mp_int_t
Definition: mpconfigport.h:73
mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in)
Definition: objstr.c:291
NORETURN void mp_raise_msg(const mp_obj_type_t *exc_type, const char *msg)
Definition: runtime.c:1448
mp_fun_1_t iternext
Definition: obj.h:521
#define assert(e)
Definition: assert.h:9
#define mp_const_none
Definition: obj.h:614
#define MP_DEFINE_CONST_DICT(dict_name, table_name)
Definition: obj.h:317
NORETURN void mp_raise_NotImplementedError(const char *msg)
Definition: runtime.c:1468
const mp_obj_type_t mp_type_TypeError
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in)
Definition: objstr.c:2137
#define MP_OBJ_IS_TYPE(o, t)
Definition: obj.h:254
mp_obj_t mp_obj_new_exception_msg_varg(const mp_obj_type_t *exc_type, const char *fmt,...)
Definition: objexcept.c:380
#define MP_OBJ_SENTINEL
Definition: obj.h:75
mp_obj_type_t * mp_obj_get_type(mp_const_obj_t o_in)
Definition: obj.c:40
int mp_print_str(const mp_print_t *print, const char *str)
Definition: mpprint.c:53
#define MP_ROM_QSTR(q)
Definition: obj.h:241
#define MP_OBJ_FROM_PTR(p)
Definition: obj.h:233
const byte * str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len, mp_obj_t index, bool is_slice)
Definition: objstr.c:399
#define MP_OBJ_NEW_QSTR(qst)
Definition: obj.h:92
#define MP_ROM_PTR(p)
Definition: obj.h:242
const mp_obj_type_t mp_type_bytes
Definition: objstr.c:1964
const byte * utf8_next_char(const byte *s)
Definition: unicode.c:89
mp_unary_op_t
Definition: runtime0.h:45
#define STATIC
Definition: mpconfig.h:1178
#define MP_OBJ_SMALL_INT_VALUE(o)
Definition: obj.h:86
#define UTF8_IS_CONT(ch)
Definition: misc.h:138
mp_obj_t(* mp_fun_1_t)(mp_obj_t)
Definition: obj.h:404
size_t mp_get_index(const mp_obj_type_t *type, size_t len, mp_obj_t index, bool is_slice)
Definition: obj.c:376
mp_print_kind_t
Definition: obj.h:412
#define MP_OBJ_NEW_SMALL_INT(small_int)
Definition: obj.h:87
const mp_obj_type_t mp_type_polymorph_iter
Definition: objpolyiter.c:48
#define MP_OBJ_NULL
Definition: obj.h:73
const mp_obj_type_t mp_type_str
Definition: objstr.c:1950
mp_obj_t mp_obj_new_str(const char *data, size_t len, bool make_qstr_if_not_already)
Definition: objstr.c:2025
unsigned char byte
Definition: misc.h:37
mp_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags)
Definition: objstr.c:1883
bool mp_obj_get_int_maybe(mp_const_obj_t arg, mp_int_t *value)
Definition: obj.c:258
const mp_obj_type_t mp_type_type
Definition: objtype.c:969
const mp_obj_type_t mp_type_IndexError
void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len)
unichar utf8_get_char(const byte *s)
Definition: unicode.c:71
const mp_obj_type_t mp_type_slice
#define MP_OBJ_TO_PTR(o)
Definition: obj.h:228
Definition: obj.h:413
mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args)
Definition: objstr.c:133
#define nlr_raise(val)
Definition: nlr.h:89
mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte *data, size_t len)
Definition: objstr.c:1981
#define UTF8_IS_NONASCII(ch)
Definition: misc.h:137
const char * mp_obj_get_type_str(mp_const_obj_t o_in)
Definition: obj.c:55
void mp_obj_slice_get(mp_obj_t self_in, mp_obj_t *start, mp_obj_t *stop, mp_obj_t *step)
qstr name
Definition: obj.h:478
#define MP_OBJ_STOP_ITERATION
Definition: obj.h:74
uint64_t mp_obj_t
Definition: obj.h:39
int mp_printf(const mp_print_t *print, const char *fmt,...)
Definition: mpprint.c:380
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf)
Definition: objstr.c:2149
mp_uint_t unichar_charlen(const char *str, mp_uint_t len)
Definition: unicode.c:113
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len)
Definition: objstr.h:55
uint unichar
Definition: misc.h:119
unsigned int uint
Definition: misc.h:38