/* stringlib: split implementation */ | |
#ifndef STRINGLIB_SPLIT_H | |
#define STRINGLIB_SPLIT_H | |
#ifndef STRINGLIB_FASTSEARCH_H | |
#error must include "stringlib/fastsearch.h" before including this module | |
#endif | |
/* Overallocate the initial list to reduce the number of reallocs for small | |
split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three | |
resizes, to sizes 4, 8, then 16. Most observed string splits are for human | |
text (roughly 11 words per line) and field delimited data (usually 1-10 | |
fields). For large strings the split algorithms are bandwidth limited | |
so increasing the preallocation likely will not improve things.*/ | |
#define MAX_PREALLOC 12 | |
/* 5 splits gives 6 elements */ | |
#define PREALLOC_SIZE(maxsplit) \ | |
(maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) | |
#define SPLIT_APPEND(data, left, right) \ | |
sub = STRINGLIB_NEW((data) + (left), \ | |
(right) - (left)); \ | |
if (sub == NULL) \ | |
goto onError; \ | |
if (PyList_Append(list, sub)) { \ | |
Py_DECREF(sub); \ | |
goto onError; \ | |
} \ | |
else \ | |
Py_DECREF(sub); | |
#define SPLIT_ADD(data, left, right) { \ | |
sub = STRINGLIB_NEW((data) + (left), \ | |
(right) - (left)); \ | |
if (sub == NULL) \ | |
goto onError; \ | |
if (count < MAX_PREALLOC) { \ | |
PyList_SET_ITEM(list, count, sub); \ | |
} else { \ | |
if (PyList_Append(list, sub)) { \ | |
Py_DECREF(sub); \ | |
goto onError; \ | |
} \ | |
else \ | |
Py_DECREF(sub); \ | |
} \ | |
count++; } | |
/* Always force the list to the expected size. */ | |
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count | |
Py_LOCAL_INLINE(PyObject *) | |
stringlib_split_whitespace(PyObject* str_obj, | |
const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
Py_ssize_t maxcount) | |
{ | |
Py_ssize_t i, j, count=0; | |
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
PyObject *sub; | |
if (list == NULL) | |
return NULL; | |
i = j = 0; | |
while (maxcount-- > 0) { | |
while (i < str_len && STRINGLIB_ISSPACE(str[i])) | |
i++; | |
if (i == str_len) break; | |
j = i; i++; | |
while (i < str_len && !STRINGLIB_ISSPACE(str[i])) | |
i++; | |
#ifndef STRINGLIB_MUTABLE | |
if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { | |
/* No whitespace in str_obj, so just use it as list[0] */ | |
Py_INCREF(str_obj); | |
PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
count++; | |
break; | |
} | |
#endif | |
SPLIT_ADD(str, j, i); | |
} | |
if (i < str_len) { | |
/* Only occurs when maxcount was reached */ | |
/* Skip any remaining whitespace and copy to end of string */ | |
while (i < str_len && STRINGLIB_ISSPACE(str[i])) | |
i++; | |
if (i != str_len) | |
SPLIT_ADD(str, i, str_len); | |
} | |
FIX_PREALLOC_SIZE(list); | |
return list; | |
onError: | |
Py_DECREF(list); | |
return NULL; | |
} | |
Py_LOCAL_INLINE(PyObject *) | |
stringlib_split_char(PyObject* str_obj, | |
const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
const STRINGLIB_CHAR ch, | |
Py_ssize_t maxcount) | |
{ | |
Py_ssize_t i, j, count=0; | |
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
PyObject *sub; | |
if (list == NULL) | |
return NULL; | |
i = j = 0; | |
while ((j < str_len) && (maxcount-- > 0)) { | |
for(; j < str_len; j++) { | |
/* I found that using memchr makes no difference */ | |
if (str[j] == ch) { | |
SPLIT_ADD(str, i, j); | |
i = j = j + 1; | |
break; | |
} | |
} | |
} | |
#ifndef STRINGLIB_MUTABLE | |
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
/* ch not in str_obj, so just use str_obj as list[0] */ | |
Py_INCREF(str_obj); | |
PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
count++; | |
} else | |
#endif | |
if (i <= str_len) { | |
SPLIT_ADD(str, i, str_len); | |
} | |
FIX_PREALLOC_SIZE(list); | |
return list; | |
onError: | |
Py_DECREF(list); | |
return NULL; | |
} | |
Py_LOCAL_INLINE(PyObject *) | |
stringlib_split(PyObject* str_obj, | |
const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, | |
Py_ssize_t maxcount) | |
{ | |
Py_ssize_t i, j, pos, count=0; | |
PyObject *list, *sub; | |
if (sep_len == 0) { | |
PyErr_SetString(PyExc_ValueError, "empty separator"); | |
return NULL; | |
} | |
else if (sep_len == 1) | |
return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); | |
list = PyList_New(PREALLOC_SIZE(maxcount)); | |
if (list == NULL) | |
return NULL; | |
i = j = 0; | |
while (maxcount-- > 0) { | |
pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); | |
if (pos < 0) | |
break; | |
j = i + pos; | |
SPLIT_ADD(str, i, j); | |
i = j + sep_len; | |
} | |
#ifndef STRINGLIB_MUTABLE | |
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
/* No match in str_obj, so just use it as list[0] */ | |
Py_INCREF(str_obj); | |
PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
count++; | |
} else | |
#endif | |
{ | |
SPLIT_ADD(str, i, str_len); | |
} | |
FIX_PREALLOC_SIZE(list); | |
return list; | |
onError: | |
Py_DECREF(list); | |
return NULL; | |
} | |
Py_LOCAL_INLINE(PyObject *) | |
stringlib_rsplit_whitespace(PyObject* str_obj, | |
const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
Py_ssize_t maxcount) | |
{ | |
Py_ssize_t i, j, count=0; | |
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
PyObject *sub; | |
if (list == NULL) | |
return NULL; | |
i = j = str_len - 1; | |
while (maxcount-- > 0) { | |
while (i >= 0 && STRINGLIB_ISSPACE(str[i])) | |
i--; | |
if (i < 0) break; | |
j = i; i--; | |
while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) | |
i--; | |
#ifndef STRINGLIB_MUTABLE | |
if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
/* No whitespace in str_obj, so just use it as list[0] */ | |
Py_INCREF(str_obj); | |
PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
count++; | |
break; | |
} | |
#endif | |
SPLIT_ADD(str, i + 1, j + 1); | |
} | |
if (i >= 0) { | |
/* Only occurs when maxcount was reached */ | |
/* Skip any remaining whitespace and copy to beginning of string */ | |
while (i >= 0 && STRINGLIB_ISSPACE(str[i])) | |
i--; | |
if (i >= 0) | |
SPLIT_ADD(str, 0, i + 1); | |
} | |
FIX_PREALLOC_SIZE(list); | |
if (PyList_Reverse(list) < 0) | |
goto onError; | |
return list; | |
onError: | |
Py_DECREF(list); | |
return NULL; | |
} | |
Py_LOCAL_INLINE(PyObject *) | |
stringlib_rsplit_char(PyObject* str_obj, | |
const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
const STRINGLIB_CHAR ch, | |
Py_ssize_t maxcount) | |
{ | |
Py_ssize_t i, j, count=0; | |
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); | |
PyObject *sub; | |
if (list == NULL) | |
return NULL; | |
i = j = str_len - 1; | |
while ((i >= 0) && (maxcount-- > 0)) { | |
for(; i >= 0; i--) { | |
if (str[i] == ch) { | |
SPLIT_ADD(str, i + 1, j + 1); | |
j = i = i - 1; | |
break; | |
} | |
} | |
} | |
#ifndef STRINGLIB_MUTABLE | |
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
/* ch not in str_obj, so just use str_obj as list[0] */ | |
Py_INCREF(str_obj); | |
PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
count++; | |
} else | |
#endif | |
if (j >= -1) { | |
SPLIT_ADD(str, 0, j + 1); | |
} | |
FIX_PREALLOC_SIZE(list); | |
if (PyList_Reverse(list) < 0) | |
goto onError; | |
return list; | |
onError: | |
Py_DECREF(list); | |
return NULL; | |
} | |
Py_LOCAL_INLINE(PyObject *) | |
stringlib_rsplit(PyObject* str_obj, | |
const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, | |
Py_ssize_t maxcount) | |
{ | |
Py_ssize_t j, pos, count=0; | |
PyObject *list, *sub; | |
if (sep_len == 0) { | |
PyErr_SetString(PyExc_ValueError, "empty separator"); | |
return NULL; | |
} | |
else if (sep_len == 1) | |
return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); | |
list = PyList_New(PREALLOC_SIZE(maxcount)); | |
if (list == NULL) | |
return NULL; | |
j = str_len; | |
while (maxcount-- > 0) { | |
pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); | |
if (pos < 0) | |
break; | |
SPLIT_ADD(str, pos + sep_len, j); | |
j = pos; | |
} | |
#ifndef STRINGLIB_MUTABLE | |
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { | |
/* No match in str_obj, so just use it as list[0] */ | |
Py_INCREF(str_obj); | |
PyList_SET_ITEM(list, 0, (PyObject *)str_obj); | |
count++; | |
} else | |
#endif | |
{ | |
SPLIT_ADD(str, 0, j); | |
} | |
FIX_PREALLOC_SIZE(list); | |
if (PyList_Reverse(list) < 0) | |
goto onError; | |
return list; | |
onError: | |
Py_DECREF(list); | |
return NULL; | |
} | |
Py_LOCAL_INLINE(PyObject *) | |
stringlib_splitlines(PyObject* str_obj, | |
const STRINGLIB_CHAR* str, Py_ssize_t str_len, | |
int keepends) | |
{ | |
/* This does not use the preallocated list because splitlines is | |
usually run with hundreds of newlines. The overhead of | |
switching between PyList_SET_ITEM and append causes about a | |
2-3% slowdown for that common case. A smarter implementation | |
could move the if check out, so the SET_ITEMs are done first | |
and the appends only done when the prealloc buffer is full. | |
That's too much work for little gain.*/ | |
register Py_ssize_t i; | |
register Py_ssize_t j; | |
PyObject *list = PyList_New(0); | |
PyObject *sub; | |
if (list == NULL) | |
return NULL; | |
for (i = j = 0; i < str_len; ) { | |
Py_ssize_t eol; | |
/* Find a line and append it */ | |
while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) | |
i++; | |
/* Skip the line break reading CRLF as one line break */ | |
eol = i; | |
if (i < str_len) { | |
if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') | |
i += 2; | |
else | |
i++; | |
if (keepends) | |
eol = i; | |
} | |
#ifndef STRINGLIB_MUTABLE | |
if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { | |
/* No linebreak in str_obj, so just use it as list[0] */ | |
if (PyList_Append(list, str_obj)) | |
goto onError; | |
break; | |
} | |
#endif | |
SPLIT_APPEND(str, j, eol); | |
j = i; | |
} | |
return list; | |
onError: | |
Py_DECREF(list); | |
return NULL; | |
} | |
#endif |