| /* stringlib: split implementation */ | |
| #ifndef STRINGLIB_SPLIT_H | |
| #define STRINGLIB_SPLIT_H | |
| #ifndef STRINGLIB_FASTSEARCH_H | |
| #error must include "stringlib/fastsearch.h" before including this module | |
| #endif | |
| /* Overallocate the initial list to reduce the number of reallocs for small | |
| split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three | |
| resizes, to sizes 4, 8, then 16. Most observed string splits are for human | |
| text (roughly 11 words per line) and field delimited data (usually 1-10 | |
| fields). For large strings the split algorithms are bandwidth limited | |
| so increasing the preallocation likely will not improve things.*/ | |
| #define MAX_PREALLOC 12 | |
| /* 5 splits gives 6 elements */ | |
| #define PREALLOC_SIZE(maxsplit) \ | |
| (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) | |
| #define SPLIT_APPEND(data, left, right) \ | |
| sub = STRINGLIB_NEW((data) + (left), \ | |
| (right) - (left)); \ | |
| if (sub == NULL) \ | |
| goto onError; \ | |
| if (PyList_Append(list, sub)) { \ | |
| Py_DECREF(sub); \ | |
| goto onError; \ | |
| } \ | |
| else \ | |
| Py_DECREF(sub); | |
| #define SPLIT_ADD(data, left, right) { \ | |
| sub = STRINGLIB_NEW((data) + (left), \ | |
| (right) - (left)); \ | |
| if (sub == NULL) \ | |
| goto onError; \ | |
| if (count < MAX_PREALLOC) { \ | |
| PyList_SET_ITEM(list, count, sub); \ | |
| } else { \ | |
| if (PyList_Append(list, sub)) { \ | |
| Py_DECREF(sub); \ | |
| goto onError; \ | |
| } \ | |
| else \ | |
| Py_DECREF(sub); \ | |
| } \ | |
| count++; } | |
| /* Always force the list to the expected size. */ | |
| #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count | |
| Py_LOCAL_INLINE(PyObject *) | |
| stringlib_split_whitespace(PyObject* str_obj, | |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
| Py_ssize_t maxcount) | |
| { | |
| Py_ssize_t i, j, count=0; | |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
| PyObject *sub; | |
| if (list == NULL) | |
| return NULL; | |
| i = j = 0; | |
| while (maxcount-- > 0) { | |
| while (i < str_len && STRINGLIB_ISSPACE(str[i])) | |
| i++; | |
| if (i == str_len) break; | |
| j = i; i++; | |
| while (i < str_len && !STRINGLIB_ISSPACE(str[i])) | |
| i++; | |
| #ifndef STRINGLIB_MUTABLE | |
| if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { | |
| /* No whitespace in str_obj, so just use it as list[0] */ | |
| Py_INCREF(str_obj); | |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
| count++; | |
| break; | |
| } | |
| #endif | |
| SPLIT_ADD(str, j, i); | |
| } | |
| if (i < str_len) { | |
| /* Only occurs when maxcount was reached */ | |
| /* Skip any remaining whitespace and copy to end of string */ | |
| while (i < str_len && STRINGLIB_ISSPACE(str[i])) | |
| i++; | |
| if (i != str_len) | |
| SPLIT_ADD(str, i, str_len); | |
| } | |
| FIX_PREALLOC_SIZE(list); | |
| return list; | |
| onError: | |
| Py_DECREF(list); | |
| return NULL; | |
| } | |
| Py_LOCAL_INLINE(PyObject *) | |
| stringlib_split_char(PyObject* str_obj, | |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
| const STRINGLIB_CHAR ch, | |
| Py_ssize_t maxcount) | |
| { | |
| Py_ssize_t i, j, count=0; | |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
| PyObject *sub; | |
| if (list == NULL) | |
| return NULL; | |
| i = j = 0; | |
| while ((j < str_len) && (maxcount-- > 0)) { | |
| for(; j < str_len; j++) { | |
| /* I found that using memchr makes no difference */ | |
| if (str[j] == ch) { | |
| SPLIT_ADD(str, i, j); | |
| i = j = j + 1; | |
| break; | |
| } | |
| } | |
| } | |
| #ifndef STRINGLIB_MUTABLE | |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
| /* ch not in str_obj, so just use str_obj as list[0] */ | |
| Py_INCREF(str_obj); | |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
| count++; | |
| } else | |
| #endif | |
| if (i <= str_len) { | |
| SPLIT_ADD(str, i, str_len); | |
| } | |
| FIX_PREALLOC_SIZE(list); | |
| return list; | |
| onError: | |
| Py_DECREF(list); | |
| return NULL; | |
| } | |
| Py_LOCAL_INLINE(PyObject *) | |
| stringlib_split(PyObject* str_obj, | |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
| const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, | |
| Py_ssize_t maxcount) | |
| { | |
| Py_ssize_t i, j, pos, count=0; | |
| PyObject *list, *sub; | |
| if (sep_len == 0) { | |
| PyErr_SetString(PyExc_ValueError, "empty separator"); | |
| return NULL; | |
| } | |
| else if (sep_len == 1) | |
| return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); | |
| list = PyList_New(PREALLOC_SIZE(maxcount)); | |
| if (list == NULL) | |
| return NULL; | |
| i = j = 0; | |
| while (maxcount-- > 0) { | |
| pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); | |
| if (pos < 0) | |
| break; | |
| j = i + pos; | |
| SPLIT_ADD(str, i, j); | |
| i = j + sep_len; | |
| } | |
| #ifndef STRINGLIB_MUTABLE | |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
| /* No match in str_obj, so just use it as list[0] */ | |
| Py_INCREF(str_obj); | |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
| count++; | |
| } else | |
| #endif | |
| { | |
| SPLIT_ADD(str, i, str_len); | |
| } | |
| FIX_PREALLOC_SIZE(list); | |
| return list; | |
| onError: | |
| Py_DECREF(list); | |
| return NULL; | |
| } | |
| Py_LOCAL_INLINE(PyObject *) | |
| stringlib_rsplit_whitespace(PyObject* str_obj, | |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
| Py_ssize_t maxcount) | |
| { | |
| Py_ssize_t i, j, count=0; | |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
| PyObject *sub; | |
| if (list == NULL) | |
| return NULL; | |
| i = j = str_len - 1; | |
| while (maxcount-- > 0) { | |
| while (i >= 0 && STRINGLIB_ISSPACE(str[i])) | |
| i--; | |
| if (i < 0) break; | |
| j = i; i--; | |
| while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) | |
| i--; | |
| #ifndef STRINGLIB_MUTABLE | |
| if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
| /* No whitespace in str_obj, so just use it as list[0] */ | |
| Py_INCREF(str_obj); | |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
| count++; | |
| break; | |
| } | |
| #endif | |
| SPLIT_ADD(str, i + 1, j + 1); | |
| } | |
| if (i >= 0) { | |
| /* Only occurs when maxcount was reached */ | |
| /* Skip any remaining whitespace and copy to beginning of string */ | |
| while (i >= 0 && STRINGLIB_ISSPACE(str[i])) | |
| i--; | |
| if (i >= 0) | |
| SPLIT_ADD(str, 0, i + 1); | |
| } | |
| FIX_PREALLOC_SIZE(list); | |
| if (PyList_Reverse(list) < 0) | |
| goto onError; | |
| return list; | |
| onError: | |
| Py_DECREF(list); | |
| return NULL; | |
| } | |
| Py_LOCAL_INLINE(PyObject *) | |
| stringlib_rsplit_char(PyObject* str_obj, | |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
| const STRINGLIB_CHAR ch, | |
| Py_ssize_t maxcount) | |
| { | |
| Py_ssize_t i, j, count=0; | |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
| PyObject *sub; | |
| if (list == NULL) | |
| return NULL; | |
| i = j = str_len - 1; | |
| while ((i >= 0) && (maxcount-- > 0)) { | |
| for(; i >= 0; i--) { | |
| if (str[i] == ch) { | |
| SPLIT_ADD(str, i + 1, j + 1); | |
| j = i = i - 1; | |
| break; | |
| } | |
| } | |
| } | |
| #ifndef STRINGLIB_MUTABLE | |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
| /* ch not in str_obj, so just use str_obj as list[0] */ | |
| Py_INCREF(str_obj); | |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
| count++; | |
| } else | |
| #endif | |
| if (j >= -1) { | |
| SPLIT_ADD(str, 0, j + 1); | |
| } | |
| FIX_PREALLOC_SIZE(list); | |
| if (PyList_Reverse(list) < 0) | |
| goto onError; | |
| return list; | |
| onError: | |
| Py_DECREF(list); | |
| return NULL; | |
| } | |
| Py_LOCAL_INLINE(PyObject *) | |
| stringlib_rsplit(PyObject* str_obj, | |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
| const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, | |
| Py_ssize_t maxcount) | |
| { | |
| Py_ssize_t j, pos, count=0; | |
| PyObject *list, *sub; | |
| if (sep_len == 0) { | |
| PyErr_SetString(PyExc_ValueError, "empty separator"); | |
| return NULL; | |
| } | |
| else if (sep_len == 1) | |
| return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); | |
| list = PyList_New(PREALLOC_SIZE(maxcount)); | |
| if (list == NULL) | |
| return NULL; | |
| j = str_len; | |
| while (maxcount-- > 0) { | |
| pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); | |
| if (pos < 0) | |
| break; | |
| SPLIT_ADD(str, pos + sep_len, j); | |
| j = pos; | |
| } | |
| #ifndef STRINGLIB_MUTABLE | |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
| /* No match in str_obj, so just use it as list[0] */ | |
| Py_INCREF(str_obj); | |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
| count++; | |
| } else | |
| #endif | |
| { | |
| SPLIT_ADD(str, 0, j); | |
| } | |
| FIX_PREALLOC_SIZE(list); | |
| if (PyList_Reverse(list) < 0) | |
| goto onError; | |
| return list; | |
| onError: | |
| Py_DECREF(list); | |
| return NULL; | |
| } | |
| Py_LOCAL_INLINE(PyObject *) | |
| stringlib_splitlines(PyObject* str_obj, | |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
| int keepends) | |
| { | |
| /* This does not use the preallocated list because splitlines is | |
| usually run with hundreds of newlines. The overhead of | |
| switching between PyList_SET_ITEM and append causes about a | |
| 2-3% slowdown for that common case. A smarter implementation | |
| could move the if check out, so the SET_ITEMs are done first | |
| and the appends only done when the prealloc buffer is full. | |
| That's too much work for little gain.*/ | |
| register Py_ssize_t i; | |
| register Py_ssize_t j; | |
| PyObject *list = PyList_New(0); | |
| PyObject *sub; | |
| if (list == NULL) | |
| return NULL; | |
| for (i = j = 0; i < str_len; ) { | |
| Py_ssize_t eol; | |
| /* Find a line and append it */ | |
| while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) | |
| i++; | |
| /* Skip the line break reading CRLF as one line break */ | |
| eol = i; | |
| if (i < str_len) { | |
| if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') | |
| i += 2; | |
| else | |
| i++; | |
| if (keepends) | |
| eol = i; | |
| } | |
| #ifndef STRINGLIB_MUTABLE | |
| if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { | |
| /* No linebreak in str_obj, so just use it as list[0] */ | |
| if (PyList_Append(list, str_obj)) | |
| goto onError; | |
| break; | |
| } | |
| #endif | |
| SPLIT_APPEND(str, j, eol); | |
| j = i; | |
| } | |
| return list; | |
| onError: | |
| Py_DECREF(list); | |
| return NULL; | |
| } | |
| #endif |