Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions mypy/typeshed/stubs/librt/librt/strings.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,12 @@ def isdigit(c: i32, /) -> bool: ...
def isalnum(c: i32, /) -> bool: ...
def isalpha(c: i32, /) -> bool: ...
def isidentifier(c: i32, /) -> bool: ...

# Codepoint case conversion. For the rare codepoints whose Unicode
# uppercase / lowercase expands to multiple codepoints (e.g. U+00DF
# uppercases to "SS", U+FB01 to "FI"), returns the input unchanged so
# the signature stays i32 -> i32. Use str.upper() / str.lower() for full
# Unicode case conversion when those cases matter. Negative inputs are
# returned unchanged.
# Uppercase mapping of a single codepoint; contract described in the note above.
def toupper(c: i32, /) -> i32: ...
# Lowercase mapping of a single codepoint; contract described in the note above.
def tolower(c: i32, /) -> i32: ...
18 changes: 18 additions & 0 deletions mypyc/lib-rt/strings/librt_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,18 @@ DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum)
DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha)
DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier)

// Generate `cp_<name>`, the Python-callable wrapper around an i32 -> i32
// codepoint helper `fn`. The single argument is converted with
// cp_parse_i32; if that conversion fails the wrapper returns NULL.
// Otherwise the helper's result is returned as a Python int.
#define DEFINE_CP_I32_WRAPPER(name, fn)                          \
    static PyObject*                                             \
    cp_##name(PyObject *module, PyObject *arg) {                 \
        int32_t codepoint;                                       \
        if (cp_parse_i32(arg, &codepoint) < 0) {                 \
            return NULL;                                         \
        }                                                        \
        return PyLong_FromLong((long) fn(codepoint));            \
    }

DEFINE_CP_I32_WRAPPER(toupper, LibRTStrings_ToUpper)
DEFINE_CP_I32_WRAPPER(tolower, LibRTStrings_ToLower)

static PyMethodDef librt_strings_module_methods[] = {
{"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL,
PyDoc_STR("Write a 16-bit signed integer to BytesWriter in little-endian format")
Expand Down Expand Up @@ -1267,6 +1279,12 @@ static PyMethodDef librt_strings_module_methods[] = {
{"isidentifier", cp_isidentifier, METH_O,
PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).")
},
{"toupper", cp_toupper, METH_O,
PyDoc_STR("Single-codepoint uppercase mapping for a codepoint (i32). Returns the input unchanged if the Unicode uppercase expands to multiple codepoints (e.g. U+00DF uppercases to \"SS\"); use str.upper() for full Unicode case conversion.")
},
{"tolower", cp_tolower, METH_O,
PyDoc_STR("Single-codepoint lowercase mapping for a codepoint (i32). Returns the input unchanged if the Unicode lowercase expands to multiple codepoints; use str.lower() for full Unicode case conversion.")
},
{NULL, NULL, 0, NULL}
};

Expand Down
44 changes: 44 additions & 0 deletions mypyc/lib-rt/strings/librt_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,48 @@ static inline bool LibRTStrings_IsIdentifier(int32_t c) {
return r == 1;
}

// Shared slow path for LibRTStrings_ToUpper / _ToLower. Round-trips the
// codepoint through CPython's str.upper / str.lower (selected by `method`)
// on a 1-character string. When the conversion expands to multiple
// codepoints (e.g. 'ß'.upper() == 'SS') we return the input unchanged so
// the public helpers stay i32 -> i32.
//
// Values that are not Unicode codepoints (negative, or above 0x10FFFF) are
// also returned unchanged. PyUnicode_FromOrdinal rejects them with a
// ValueError, and treating that NULL as an allocation failure would abort
// the process for an input that is a perfectly valid i32.
//
// Aborts via CPyError_OutOfMemory on allocation failure.
static inline int32_t LibRTStrings_ChangeCase_slow(int32_t c, const char *method) {
    // Not a codepoint: nothing to convert, and CPython would raise.
    if (c < 0 || c > 0x10FFFF) {
        return c;
    }
    PyObject *s = PyUnicode_FromOrdinal((int)c);
    if (s == NULL) {
        CPyError_OutOfMemory();
    }
    PyObject *u = PyObject_CallMethod(s, method, NULL);
    Py_DECREF(s);
    if (u == NULL) {
        CPyError_OutOfMemory();
    }
    // Default to the input; only a single-codepoint result replaces it.
    int32_t result = c;
    if (PyUnicode_GET_LENGTH(u) == 1) {
        result = (int32_t)PyUnicode_READ_CHAR(u, 0);
    }
    Py_DECREF(u);
    return result;
}

// Return the uppercase form of codepoint `c`.
//
// Values below 128 (including negative ones) are resolved inline: `a..z`
// map to `A..Z` and everything else is returned as-is. Values of 128 and
// above go to LibRTStrings_ChangeCase_slow with "upper", which returns the
// input unchanged when the uppercase form is not a single codepoint.
static inline int32_t LibRTStrings_ToUpper(int32_t c) {
    if (c >= 128) {
        return LibRTStrings_ChangeCase_slow(c, "upper");
    }
    const bool is_ascii_lower = (c >= 'a' && c <= 'z');
    return is_ascii_lower ? c - ('a' - 'A') : c;
}

// Return the lowercase form of codepoint `c`.
//
// Values below 128 (including negative ones) are resolved inline: `A..Z`
// map to `a..z` and everything else is returned as-is. Values of 128 and
// above go to LibRTStrings_ChangeCase_slow with "lower", which returns the
// input unchanged when the lowercase form is not a single codepoint.
static inline int32_t LibRTStrings_ToLower(int32_t c) {
    if (c >= 128) {
        return LibRTStrings_ChangeCase_slow(c, "lower");
    }
    const bool is_ascii_upper = (c >= 'A' && c <= 'Z');
    return is_ascii_upper ? c + ('a' - 'A') : c;
}

#endif // LIBRT_STRINGS_H
23 changes: 23 additions & 0 deletions mypyc/primitives/librt_strings_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,3 +438,26 @@
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS],
)

# Codepoint case conversion. When the Unicode uppercase/lowercase of a
# codepoint expands to multiple codepoints (e.g. U+00DF uppercases to "SS",
# U+FB01 to "FI"), returns the input unchanged so the signature stays
# i32 -> i32; callers needing full Unicode case conversion should use
# str.upper() / .lower() instead. Negative inputs are returned unchanged.
# Register librt.strings.toupper as a primitive that calls the C helper
# LibRTStrings_ToUpper directly on an unboxed i32.
function_op(
name="librt.strings.toupper",
arg_types=[int32_rprimitive],
return_type=int32_rprimitive,
c_function_name="LibRTStrings_ToUpper",
# The helper returns its input unchanged instead of reporting failure
# (see the comment above), so the op is declared as never erroring.
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS],
)

# Lowercase counterpart of librt.strings.toupper: same argument and return
# types, backed by the C helper LibRTStrings_ToLower.
function_op(
name="librt.strings.tolower",
arg_types=[int32_rprimitive],
return_type=int32_rprimitive,
c_function_name="LibRTStrings_ToLower",
# Declared as never erroring, matching the toupper registration.
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS],
)
26 changes: 26 additions & 0 deletions mypyc/test-data/irbuild-librt-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -401,3 +401,29 @@ def is_id(c):
L0:
r0 = LibRTStrings_IsIdentifier(c)
return r0

[case testLibrtStringsToUpperIR]
from librt.strings import toupper
from mypy_extensions import i32

# Typed i32 call; the expected IR below is a single LibRTStrings_ToUpper call.
def up(c: i32) -> i32:
return toupper(c)
[out]
def up(c):
c, r0 :: i32
L0:
r0 = LibRTStrings_ToUpper(c)
return r0

[case testLibrtStringsToLowerIR]
from librt.strings import tolower
from mypy_extensions import i32

# Typed i32 call; the expected IR below is a single LibRTStrings_ToLower call.
def lo(c: i32) -> i32:
return tolower(c)
[out]
def lo(c):
c, r0 :: i32
L0:
r0 = LibRTStrings_ToLower(c)
return r0
49 changes: 49 additions & 0 deletions mypyc/test-data/run-librt-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -1490,3 +1490,52 @@ def test_codepoint_classifiers_via_any() -> None:
f(1 << 40)
with assertRaises(OverflowError, "codepoint out of i32 range"):
f(-(1 << 40))

[case testLibrtStringsCodepointCaseConversion_librt]
from typing import Any
from mypy_extensions import i32
from librt.strings import toupper, tolower

from testutil import assertRaises


def _expect(c: str, method: str) -> int:
    """Return the codepoint toupper/tolower is expected to produce for ``c``.

    ``method`` names the ``str`` method to apply ("upper" or "lower"). A
    one-character result yields its ordinal; a result of any other length
    means the helper must hand back the input codepoint unchanged.
    """
    mapped = getattr(c, method)()
    return ord(mapped) if len(mapped) == 1 else ord(c)


def test_codepoint_case_conversion() -> None:
    """Check toupper/tolower against str.upper/str.lower for every codepoint."""
    # Negative values come back exactly as they went in.
    for negative in (i32(-1), i32(-113)):
        assert toupper(negative) == negative
        assert tolower(negative) == negative
    # Sweep the whole Unicode range: the helpers must match str.upper /
    # str.lower when the result is one codepoint, and otherwise echo the input.
    for ordinal in range(0x110000):
        char = chr(ordinal)
        codepoint = ord(char)
        assert toupper(codepoint) == _expect(char, "upper")
        assert tolower(codepoint) == _expect(char, "lower")


def test_codepoint_case_conversion_via_any() -> None:
# Iterate to force generic dispatch through the PyMethodDef wrapper.
# Each entry is (function under test, input codepoint, expected result).
for fn, in_cp, out_cp in (
(toupper, ord("a"), ord("A")),
(toupper, ord("A"), ord("A")),
(tolower, ord("Z"), ord("z")),
(tolower, ord("z"), ord("z")),
):
f: Any = fn
assert f(in_cp) == out_cp
# Negative values are valid i32, returned unchanged.
assert f(-1) == -1
# Inputs outside i32 range raise OverflowError through the wrapper.
# 1 << 40 and -(1 << 40) exceed the i32 range in either direction.
with assertRaises(OverflowError, "codepoint out of i32 range"):
f(1 << 40)
with assertRaises(OverflowError, "codepoint out of i32 range"):
f(-(1 << 40))
Loading