@@ -785,6 +785,22 @@ unicode_result(PyObject *unicode)
785785static PyObject *
786786unicode_result_unchanged (PyObject * unicode )
787787{
788+
789+ /* Check if a Unicode string is a palindrome */
790+ static int
791+ unicode_is_palindrome (PyObject * unicode )
792+ {
793+ Py_ssize_t length = PyUnicode_GET_LENGTH (unicode );
794+ int kind = PyUnicode_KIND (unicode );
795+ const void * data = PyUnicode_DATA (unicode );
796+
797+ for (Py_ssize_t i = 0 ; i < length / 2 ; i ++ ) {
798+ if (PyUnicode_READ (kind , data , i ) != PyUnicode_READ (kind , data , length - i - 1 )) {
799+ return 0 ;
800+ }
801+ }
802+ return 1 ;
803+ }
788804 if (PyUnicode_CheckExact (unicode )) {
789805 return Py_NewRef (unicode );
790806 }
@@ -5061,6 +5077,14 @@ load_unaligned(const unsigned char *p, size_t size)
50615077}
50625078#endif
50635079
5080+ /*
5081+ * Find the first non-ASCII character in a byte sequence.
5082+ *
5083+ * This function scans a range of bytes from `start` to `end` and returns the
5084+ * index of the first byte that is not an ASCII character (i.e., has the most
5085+ * significant bit set). If all characters in the range are ASCII, it returns
5086+ * `end - start`.
5087+ */
50645088static Py_ssize_t
50655089find_first_nonascii (const unsigned char * start , const unsigned char * end )
50665090{
@@ -5122,18 +5146,23 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
51225146#endif
51235147}
51245148
/*
 * Tell whether `ch` has the bit pattern of the first byte of a UTF-8
 * sequence.
 *
 * Returns 1 when bit 7 of `ch` is clear (0xxxxxxx) or bit 6 is set
 * (11xxxxxx), and 0 when bits 7..6 are exactly 10 (10xxxxxx).
 * Only bits 6 and 7 of `ch` influence the result.
 */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // 10xxxxxx is the only combination of bits 7..6 that is not a first byte.
    return (ch & 0xC0u) != 0x80u;
}
51305155
5131- static inline size_t vector_utf8_start_chars (size_t v )
5156+ static inline size_t
5157+ vector_utf8_start_chars (size_t v )
51325158{
51335159 return ((~v >> 7 ) | (v >> 6 )) & VECTOR_0101 ;
51345160}
51355161
5136- static Py_ssize_t utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
5162+
5163+ // Count the number of UTF-8 code points in a given byte sequence.
5164+ static Py_ssize_t
5165+ utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
51375166{
51385167 Py_ssize_t len = 0 ;
51395168
@@ -5377,6 +5406,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
53775406 // otherwise: check the input and decide the maxchr and maxsize to reduce
53785407 // reallocation and copy.
53795408 if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2 ) {
5409+ // We only count the code points here and do not determine the exact maxchr,
5410+ // because writing fast and portable SIMD code to find maxchr is difficult.
5411+ // Even if a reallocation is later needed for a larger maxchr, knowing the exact
5412+ // number of code points means it is no longer necessary to allocate several
5413+ // times the required amount of memory.
53805414 maxsize = utf8_count_codepoints ((const unsigned char * )s , (const unsigned char * )end );
53815415 if (ch < 0xc4 ) { // latin1
53825416 maxchr = 0xff ;
0 commit comments