"""
This module contains a set of functions for vectorized string
operations.
"""
import sys
import numpy as np
from numpy import (
equal, not_equal, less, less_equal, greater, greater_equal,
add, multiply as _multiply_ufunc,
)
from numpy._core.multiarray import _vec_string
from numpy._core.umath import (
isalpha,
isdigit,
isspace,
isalnum,
islower,
isupper,
istitle,
isdecimal,
isnumeric,
str_len,
find as _find_ufunc,
rfind as _rfind_ufunc,
index as _index_ufunc,
rindex as _rindex_ufunc,
count as _count_ufunc,
startswith as _startswith_ufunc,
endswith as _endswith_ufunc,
_lstrip_whitespace,
_lstrip_chars,
_rstrip_whitespace,
_rstrip_chars,
_strip_whitespace,
_strip_chars,
_replace,
_expandtabs_length,
_expandtabs,
_center,
_ljust,
_rjust,
_zfill,
_partition,
_partition_index,
_rpartition,
_rpartition_index,
)
# Names exported by ``from <this module> import *``.
__all__ = [
    # UFuncs
    "equal", "not_equal", "less", "less_equal", "greater", "greater_equal",
    "add", "multiply", "isalpha", "isdigit", "isspace", "isalnum", "islower",
    "isupper", "istitle", "isdecimal", "isnumeric", "str_len", "find",
    "rfind", "index", "rindex", "count", "startswith", "endswith", "lstrip",
    "rstrip", "strip", "replace", "expandtabs", "center", "ljust", "rjust",
    "zfill", "partition", "rpartition",
    # _vec_string - Will gradually become ufuncs as well
    "upper", "lower", "swapcase", "capitalize", "title",
    # _vec_string - Will probably not become ufuncs
    "mod", "decode", "encode", "translate",
    # Removed from namespace until behavior has been crystallized
    # "join", "split", "rsplit", "splitlines",
]

# Largest int64 value.  The search wrappers in this module (find, rfind,
# index, rindex, count, startswith, endswith) pass it to the underlying
# ufunc in place of ``end=None``.
MAX = np.iinfo(np.int64).max
def _get_num_chars(a):
    """
    Return how many characters each element of a string array can hold.

    For a ``str_`` array every character occupies 4 bytes, so the
    itemsize is divided by 4. For any other dtype the itemsize is
    returned unchanged.
    """
    bytes_per_char = 4 if issubclass(a.dtype.type, np.str_) else 1
    return a.itemsize // bytes_per_char
def _to_bytes_or_str_array(result, output_dtype_like):
    """
    Cast an intermediate (object) result array back to a string dtype.

    The target dtype family is taken from ``output_dtype_like``. For the
    fixed-width dtypes the width is recomputed from the converted data
    rather than copied from ``output_dtype_like``.
    """
    target = np.asarray(output_dtype_like).dtype

    if result.size == 0:
        # Going through tolist()/asarray() on an empty array would
        # lose its shape information, so cast it directly.
        return result.astype(target)

    converted = np.asarray(result.tolist())
    dtype_cls = type(target)

    if isinstance(target, np.dtypes.StringDType):
        return converted.astype(dtype_cls)
    return converted.astype(dtype_cls(_get_num_chars(converted)))
def _clean_args(*args):
    """
    Trim an argument list at its first ``None``.

    Many Python string methods with optional arguments do not accept
    ``None`` as "use the default". To delegate to them, the first
    ``None`` argument and everything after it must be dropped.

    Returns the leading run of non-``None`` arguments as a list.
    """
    for position, candidate in enumerate(args):
        if candidate is None:
            return list(args[:position])
    return list(args)
def multiply(a, i):
"""
Return (a * i), that is string multiple concatenation,
element-wise.
Values in ``i`` of less than 0 are treated as 0 (which yields an
empty string).
Parameters
----------
a : array_like, with ``StringDType``, ``bytes_`` or ``str_`` dtype
i : array_like, with any integer dtype
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
Examples
--------
>>> import numpy as np
>>> a = np.array(["a", "b", "c"])
>>> np.strings.multiply(a, 3)
array(['aaa', 'bbb', 'ccc'], dtype='<U3')
>>> i = np.array([1, 2, 3])
>>> np.strings.multiply(a, i)
array(['a', 'bb', 'ccc'], dtype='<U3')
>>> np.strings.multiply(np.array(['a']), i)
array(['a', 'aa', 'aaa'], dtype='<U3')
>>> a = np.array(['a', 'b', 'c', 'd', 'e', 'f']).reshape((2, 3))
>>> np.strings.multiply(a, 3)
array([['aaa', 'bbb', 'ccc'],
['ddd', 'eee', 'fff']], dtype='<U3')
>>> np.strings.multiply(a, i)
array([['a', 'bb', 'ccc'],
['d', 'ee', 'fff']], dtype='<U3')
"""
a = np.asanyarray(a)
i = np.asanyarray(i)
if not np.issubdtype(i.dtype, np.integer):
raise TypeError(f"unsupported type {i.dtype} for operand 'i'")
i = np.maximum(i, 0)
# delegate to stringdtype loops that also do overflow checking
if a.dtype.char == "T":
return a * i
a_len = str_len(a)
# Ensure we can do a_len * i without overflow.
if np.any(a_len > sys.maxsize / np.maximum(i, 1)):
raise MemoryError("repeated string is too long")
buffersizes = a_len * i
out_dtype = f"{a.dtype.char}{buffersizes.max()}"
out = np.empty_like(a, shape=buffersizes.shape, dtype=out_dtype)
return _multiply_ufunc(a, i, out=out)
def mod(a, values):
    """
    Return (a % i), that is pre-Python 2.6 string formatting
    (interpolation), element-wise for a pair of array_likes of str
    or unicode.

    Parameters
    ----------
    a : array_like, with `np.bytes_` or `np.str_` dtype

    values : array_like of values
        These values will be element-wise interpolated into the string.

    Returns
    -------
    out : ndarray
        Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
        depending on input types

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array(["NumPy is a %s library"])
    >>> np.strings.mod(a, values=["Python"])
    array(['NumPy is a Python library'], dtype='<U25')

    >>> a = np.array([b'%d bytes', b'%d bits'])
    >>> values = np.array([8, 64])
    >>> np.strings.mod(a, values)
    array([b'8 bytes', b'64 bits'], dtype='|S7')

    """
    # Format via ``__mod__`` into an object array, then cast the result
    # back to the dtype family of ``a``.
    interpolated = _vec_string(a, np.object_, '__mod__', (values,))
    return _to_bytes_or_str_array(interpolated, a)
def find(a, sub, start=0, end=None):
    """
    For each element, return the lowest index in the string where
    substring ``sub`` is found, such that ``sub`` is contained in the
    range [``start``, ``end``).

    Parameters
    ----------
    a : array_like, with ``StringDType``, ``bytes_`` or ``str_`` dtype

    sub : array_like, with ``StringDType``, ``bytes_`` or ``str_`` dtype
        The substring to search for.

    start, end : array_like, with any integer dtype
        The range to look in, interpreted as in slice notation.

    Returns
    -------
    y : ndarray
        Output array of ints

    See Also
    --------
    str.find

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array(["NumPy is a Python library"])
    >>> np.strings.find(a, "Python")
    array([11])

    """
    if end is None:
        # No upper bound requested: pass the largest int64 instead.
        end = MAX
    return _find_ufunc(a, sub, start, end)
def rfind(a, sub, start=0, end=None):
    """
    For each element, return the highest index in the string where
    substring ``sub`` is found, such that ``sub`` is contained in the
    range [``start``, ``end``).

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    sub : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
        The substring to search for.

    start, end : array_like, with any integer dtype
        The range to look in, interpreted as in slice notation.

    Returns
    -------
    y : ndarray
        Output array of ints

    See Also
    --------
    str.rfind

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array(["Computer Science"])
    >>> np.strings.rfind(a, "Science", start=0, end=None)
    array([9])
    >>> np.strings.rfind(a, "Science", start=0, end=8)
    array([-1])
    >>> b = np.array(["Computer Science", "Science"])
    >>> np.strings.rfind(b, "Science", start=0, end=None)
    array([9, 0])

    """
    if end is None:
        # No upper bound requested: pass the largest int64 instead.
        end = MAX
    return _rfind_ufunc(a, sub, start, end)
def index(a, sub, start=0, end=None):
    """
    Like `find`, but raises :exc:`ValueError` when the substring is not
    found.

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    sub : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    start, end : array_like, with any integer dtype, optional

    Returns
    -------
    out : ndarray
        Output array of ints.

    See Also
    --------
    find, str.index

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array(["Computer Science"])
    >>> np.strings.index(a, "Science", start=0, end=None)
    array([9])

    """
    if end is None:
        # No upper bound requested: pass the largest int64 instead.
        end = MAX
    return _index_ufunc(a, sub, start, end)
def rindex(a, sub, start=0, end=None):
    """
    Like `rfind`, but raises :exc:`ValueError` when the substring `sub` is
    not found.

    Parameters
    ----------
    a : array-like, with `np.bytes_` or `np.str_` dtype

    sub : array-like, with `np.bytes_` or `np.str_` dtype

    start, end : array-like, with any integer dtype, optional

    Returns
    -------
    out : ndarray
        Output array of ints.

    See Also
    --------
    rfind, str.rindex

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array(["Computer Science"])
    >>> np.strings.rindex(a, "Science", start=0, end=None)
    array([9])

    """
    if end is None:
        # No upper bound requested: pass the largest int64 instead.
        end = MAX
    return _rindex_ufunc(a, sub, start, end)
def count(a, sub, start=0, end=None):
    """
    Returns an array with the number of non-overlapping occurrences of
    substring ``sub`` in the range [``start``, ``end``).

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    sub : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
        The substring to search for.

    start, end : array_like, with any integer dtype
        The range to look in, interpreted as in slice notation.

    Returns
    -------
    y : ndarray
        Output array of ints

    See Also
    --------
    str.count

    Examples
    --------
    >>> import numpy as np
    >>> c = np.array(['aAaAaA', '  aA  ', 'abBABba'])
    >>> c
    array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')
    >>> np.strings.count(c, 'A')
    array([3, 1, 1])
    >>> np.strings.count(c, 'aA')
    array([3, 1, 0])
    >>> np.strings.count(c, 'A', start=1, end=4)
    array([2, 1, 1])
    >>> np.strings.count(c, 'A', start=1, end=3)
    array([1, 0, 0])

    """
    if end is None:
        # No upper bound requested: pass the largest int64 instead.
        end = MAX
    return _count_ufunc(a, sub, start, end)
def startswith(a, prefix, start=0, end=None):
    """
    Returns a boolean array which is `True` where the string element
    in ``a`` starts with ``prefix``, otherwise `False`.

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    prefix : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    start, end : array_like, with any integer dtype
        With ``start``, test beginning at that position. With ``end``,
        stop comparing at that position.

    Returns
    -------
    out : ndarray
        Output array of bools

    See Also
    --------
    str.startswith

    Examples
    --------
    >>> import numpy as np
    >>> s = np.array(['foo', 'bar'])
    >>> s
    array(['foo', 'bar'], dtype='<U3')
    >>> np.strings.startswith(s, 'fo')
    array([ True, False])
    >>> np.strings.startswith(s, 'o', start=1, end=2)
    array([ True, False])

    """
    if end is None:
        # No upper bound requested: pass the largest int64 instead.
        end = MAX
    return _startswith_ufunc(a, prefix, start, end)
def endswith(a, suffix, start=0, end=None):
    """
    Returns a boolean array which is `True` where the string element
    in ``a`` ends with ``suffix``, otherwise `False`.

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    suffix : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    start, end : array_like, with any integer dtype
        With ``start``, test beginning at that position. With ``end``,
        stop comparing at that position.

    Returns
    -------
    out : ndarray
        Output array of bools

    See Also
    --------
    str.endswith

    Examples
    --------
    >>> import numpy as np
    >>> s = np.array(['foo', 'bar'])
    >>> s
    array(['foo', 'bar'], dtype='<U3')
    >>> np.strings.endswith(s, 'ar')
    array([False,  True])
    >>> np.strings.endswith(s, 'a', start=1, end=2)
    array([False,  True])

    """
    if end is None:
        # No upper bound requested: pass the largest int64 instead.
        end = MAX
    return _endswith_ufunc(a, suffix, start, end)
def decode(a, encoding=None, errors=None):
    r"""
    Calls :meth:`bytes.decode` element-wise.

    The set of available codecs comes from the Python standard library,
    and may be extended at runtime.  For more information, see the
    :mod:`codecs` module.

    Parameters
    ----------
    a : array_like, with ``bytes_`` dtype

    encoding : str, optional
       The name of an encoding

    errors : str, optional
       Specifies how to handle encoding errors

    Returns
    -------
    out : ndarray

    See Also
    --------
    :py:meth:`bytes.decode`

    Notes
    -----
    The type of the result will depend on the encoding specified.

    Examples
    --------
    >>> import numpy as np
    >>> c = np.array([b'\x81\xc1\x81\xc1\x81\xc1', b'@@\x81\xc1@@',
    ...               b'\x81\x82\xc2\xc1\xc2\x82\x81'])
    >>> c
    array([b'\x81\xc1\x81\xc1\x81\xc1', b'@@\x81\xc1@@',
           b'\x81\x82\xc2\xc1\xc2\x82\x81'], dtype='|S7')
    >>> np.strings.decode(c, encoding='cp037')
    array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')

    """
    # Only forward the arguments up to the first one left as None.
    codec_args = _clean_args(encoding, errors)
    decoded = _vec_string(a, np.object_, 'decode', codec_args)
    return _to_bytes_or_str_array(decoded, np.str_(''))
def encode(a, encoding=None, errors=None):
    r"""
    Calls :meth:`str.encode` element-wise.

    The set of available codecs comes from the Python standard library,
    and may be extended at runtime. For more information, see the
    :mod:`codecs` module.

    Parameters
    ----------
    a : array_like, with ``StringDType`` or ``str_`` dtype

    encoding : str, optional
       The name of an encoding

    errors : str, optional
       Specifies how to handle encoding errors

    Returns
    -------
    out : ndarray

    See Also
    --------
    str.encode

    Notes
    -----
    The type of the result will depend on the encoding specified.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array(['aAaAaA', '  aA  ', 'abBABba'])
    >>> np.strings.encode(a, encoding='cp037')
    array([b'\x81\xc1\x81\xc1\x81\xc1', b'@@\x81\xc1@@',
           b'\x81\x82\xc2\xc1\xc2\x82\x81'], dtype='|S7')

    """
    # Only forward the arguments up to the first one left as None.
    codec_args = _clean_args(encoding, errors)
    encoded = _vec_string(a, np.object_, 'encode', codec_args)
    return _to_bytes_or_str_array(encoded, np.bytes_(b''))
def expandtabs(a, tabsize=8):
"""
Return a copy of each string element where all tab characters are
replaced by one or more spaces.
Calls :meth:`str.expandtabs` element-wise.
Return a copy of each string element where all tab characters are
replaced by one or more spaces, depending on the current column
and the given `tabsize`. The column number is reset to zero after
each newline occurring in the string. This doesn't understand other
non-printing characters or escape sequences.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array
tabsize : int, optional
Replace tabs with `tabsize` number of spaces. If not given defaults
to 8 spaces.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input type
See Also
--------
str.expandtabs
Examples
--------
>>> import numpy as np
>>> a = np.array(['\t\tHello\tworld'])
>>> np.strings.expandtabs(a, tabsize=4) # doctest: +SKIP
array([' Hello world'], dtype='<U21') # doctest: +SKIP
"""
a = np.asanyarray(a)
tabsize = np.asanyarray(tabsize)
if a.dtype.char == "T":
return _expandtabs(a, tabsize)
buffersizes = _expandtabs_length(a, tabsize)
out_dtype = f"{a.dtype.char}{buffersizes.max()}"
out = np.empty_like(a, shape=buffersizes.shape, dtype=out_dtype)
return _expandtabs(a, tabsize, out=out)
def center(a, width, fillchar=' '):
"""
Return a copy of `a` with its elements centered in a string of
length `width`.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
width : array_like, with any integer dtype
The length of the resulting strings, unless ``width < str_len(a)``.
fillchar : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Optional padding character to use (default is space).
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.center
Notes
-----
While it is possible for ``a`` and ``fillchar`` to have different dtypes,
passing a non-ASCII character in ``fillchar`` when ``a`` is of dtype "S"
is not allowed, and a ``ValueError`` is raised.
Examples
--------
>>> import numpy as np
>>> c = np.array(['a1b2','1b2a','b2a1','2a1b']); c
array(['a1b2', '1b2a', 'b2a1', '2a1b'], dtype='<U4')
>>> np.strings.center(c, width=9)
array([' a1b2 ', ' 1b2a ', ' b2a1 ', ' 2a1b '], dtype='<U9')
>>> np.strings.center(c, width=9, fillchar='*')
array(['***a1b2**', '***1b2a**', '***b2a1**', '***2a1b**'], dtype='<U9')
>>> np.strings.center(c, width=1)
array(['a1b2', '1b2a', 'b2a1', '2a1b'], dtype='<U4')
"""
width = np.asanyarray(width)
if not np.issubdtype(width.dtype, np.integer):
raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
a = np.asanyarray(a)
fillchar = np.asanyarray(fillchar)
if np.any(str_len(fillchar) != 1):
raise TypeError(
"The fill character must be exactly one character long")
if np.result_type(a, fillchar).char == "T":
return _center(a, width, fillchar)
fillchar = fillchar.astype(a.dtype, copy=False)
width = np.maximum(str_len(a), width)
out_dtype = f"{a.dtype.char}{width.max()}"
shape = np.broadcast_shapes(a.shape, width.shape, fillchar.shape)
out = np.empty_like(a, shape=shape, dtype=out_dtype)
return _center(a, width, fillchar, out=out)
def ljust(a, width, fillchar=' '):
"""
Return an array with the elements of `a` left-justified in a
string of length `width`.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
width : array_like, with any integer dtype
The length of the resulting strings, unless ``width < str_len(a)``.
fillchar : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Optional character to use for padding (default is space).
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.ljust
Notes
-----
While it is possible for ``a`` and ``fillchar`` to have different dtypes,
passing a non-ASCII character in ``fillchar`` when ``a`` is of dtype "S"
is not allowed, and a ``ValueError`` is raised.
Examples
--------
>>> import numpy as np
>>> c = np.array(['aAaAaA', ' aA ', 'abBABba'])
>>> np.strings.ljust(c, width=3)
array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
>>> np.strings.ljust(c, width=9)
array(['aAaAaA ', ' aA ', 'abBABba '], dtype='<U9')
"""
width = np.asanyarray(width)
if not np.issubdtype(width.dtype, np.integer):
raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
a = np.asanyarray(a)
fillchar = np.asanyarray(fillchar)
if np.any(str_len(fillchar) != 1):
raise TypeError(
"The fill character must be exactly one character long")
if np.result_type(a, fillchar).char == "T":
return _ljust(a, width, fillchar)
fillchar = fillchar.astype(a.dtype, copy=False)
width = np.maximum(str_len(a), width)
shape = np.broadcast_shapes(a.shape, width.shape, fillchar.shape)
out_dtype = f"{a.dtype.char}{width.max()}"
out = np.empty_like(a, shape=shape, dtype=out_dtype)
return _ljust(a, width, fillchar, out=out)
def rjust(a, width, fillchar=' '):
"""
Return an array with the elements of `a` right-justified in a
string of length `width`.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
width : array_like, with any integer dtype
The length of the resulting strings, unless ``width < str_len(a)``.
fillchar : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Optional padding character to use (default is space).
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.rjust
Notes
-----
While it is possible for ``a`` and ``fillchar`` to have different dtypes,
passing a non-ASCII character in ``fillchar`` when ``a`` is of dtype "S"
is not allowed, and a ``ValueError`` is raised.
Examples
--------
>>> import numpy as np
>>> a = np.array(['aAaAaA', ' aA ', 'abBABba'])
>>> np.strings.rjust(a, width=3)
array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
>>> np.strings.rjust(a, width=9)
array([' aAaAaA', ' aA ', ' abBABba'], dtype='<U9')
"""
width = np.asanyarray(width)
if not np.issubdtype(width.dtype, np.integer):
raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
a = np.asanyarray(a)
fillchar = np.asanyarray(fillchar)
if np.any(str_len(fillchar) != 1):
raise TypeError(
"The fill character must be exactly one character long")
if np.result_type(a, fillchar).char == "T":
return _rjust(a, width, fillchar)
fillchar = fillchar.astype(a.dtype, copy=False)
width = np.maximum(str_len(a), width)
shape = np.broadcast_shapes(a.shape, width.shape, fillchar.shape)
out_dtype = f"{a.dtype.char}{width.max()}"
out = np.empty_like(a, shape=shape, dtype=out_dtype)
return _rjust(a, width, fillchar, out=out)
def zfill(a, width):
"""
Return the numeric string left-filled with zeros. A leading
sign prefix (``+``/``-``) is handled by inserting the padding
after the sign character rather than before.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
width : array_like, with any integer dtype
Width of string to left-fill elements in `a`.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input type
See Also
--------
str.zfill
Examples
--------
>>> import numpy as np
>>> np.strings.zfill(['1', '-1', '+1'], 3)
array(['001', '-01', '+01'], dtype='<U3')
"""
width = np.asanyarray(width)
if not np.issubdtype(width.dtype, np.integer):
raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
a = np.asanyarray(a)
if a.dtype.char == "T":
return _zfill(a, width)
width = np.maximum(str_len(a), width)
shape = np.broadcast_shapes(a.shape, width.shape)
out_dtype = f"{a.dtype.char}{width.max()}"
out = np.empty_like(a, shape=shape, dtype=out_dtype)
return _zfill(a, width, out=out)
def lstrip(a, chars=None):
"""
For each element in `a`, return a copy with the leading characters
removed.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
chars : scalar with the same dtype as ``a``, optional
The ``chars`` argument is a string specifying the set of
characters to be removed. If ``None``, the ``chars``
argument defaults to removing whitespace. The ``chars`` argument
is not a prefix or suffix; rather, all combinations of its
values are stripped.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.lstrip
Examples
--------
>>> import numpy as np
>>> c = np.array(['aAaAaA', ' aA ', 'abBABba'])
>>> c
array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
# The 'a' variable is unstripped from c[1] because of leading whitespace.
>>> np.strings.lstrip(c, 'a')
array(['AaAaA', ' aA ', 'bBABba'], dtype='<U7')
>>> np.strings.lstrip(c, 'A') # leaves c unchanged
array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
>>> (np.strings.lstrip(c, ' ') == np.strings.lstrip(c, '')).all()
np.False_
>>> (np.strings.lstrip(c, ' ') == np.strings.lstrip(c)).all()
np.True_
"""
if chars is None:
return _lstrip_whitespace(a)
return _lstrip_chars(a, chars)
def rstrip(a, chars=None):
"""
For each element in `a`, return a copy with the trailing characters
removed.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
chars : scalar with the same dtype as ``a``, optional
The ``chars`` argument is a string specifying the set of
characters to be removed. If ``None``, the ``chars``
argument defaults to removing whitespace. The ``chars`` argument
is not a prefix or suffix; rather, all combinations of its
values are stripped.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.rstrip
Examples
--------
>>> import numpy as np
>>> c = np.array(['aAaAaA', 'abBABba'])
>>> c
array(['aAaAaA', 'abBABba'], dtype='<U7')
>>> np.strings.rstrip(c, 'a')
array(['aAaAaA', 'abBABb'], dtype='<U7')
>>> np.strings.rstrip(c, 'A')
array(['aAaAa', 'abBABba'], dtype='<U7')
"""
if chars is None:
return _rstrip_whitespace(a)
return _rstrip_chars(a, chars)
def strip(a, chars=None):
"""
For each element in `a`, return a copy with the leading and
trailing characters removed.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
chars : scalar with the same dtype as ``a``, optional
The ``chars`` argument is a string specifying the set of
characters to be removed. If ``None``, the ``chars``
argument defaults to removing whitespace. The ``chars`` argument
is not a prefix or suffix; rather, all combinations of its
values are stripped.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.strip
Examples
--------
>>> import numpy as np
>>> c = np.array(['aAaAaA', ' aA ', 'abBABba'])
>>> c
array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
>>> np.strings.strip(c)
array(['aAaAaA', 'aA', 'abBABba'], dtype='<U7')
# 'a' unstripped from c[1] because of leading whitespace.
>>> np.strings.strip(c, 'a')
array(['AaAaA', ' aA ', 'bBABb'], dtype='<U7')
# 'A' unstripped from c[1] because of trailing whitespace.
>>> np.strings.strip(c, 'A')
array(['aAaAa', ' aA ', 'abBABba'], dtype='<U7')
"""
if chars is None:
return _strip_whitespace(a)
return _strip_chars(a, chars)
def upper(a):
"""
Return an array with the elements converted to uppercase.
Calls :meth:`str.upper` element-wise.
For 8-bit strings, this method is locale-dependent.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.upper
Examples
--------
>>> import numpy as np
>>> c = np.array(['a1b c', '1bca', 'bca1']); c
array(['a1b c', '1bca', 'bca1'], dtype='<U5')
>>> np.strings.upper(c)
array(['A1B C', '1BCA', 'BCA1'], dtype='<U5')
"""
a_arr = np.asarray(a)
return _vec_string(a_arr, a_arr.dtype, 'upper')
def lower(a):
"""
Return an array with the elements converted to lowercase.
Call :meth:`str.lower` element-wise.
For 8-bit strings, this method is locale-dependent.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.lower
Examples
--------
>>> import numpy as np
>>> c = np.array(['A1B C', '1BCA', 'BCA1']); c
array(['A1B C', '1BCA', 'BCA1'], dtype='<U5')
>>> np.strings.lower(c)
array(['a1b c', '1bca', 'bca1'], dtype='<U5')
"""
a_arr = np.asarray(a)
return _vec_string(a_arr, a_arr.dtype, 'lower')
def swapcase(a):
"""
Return element-wise a copy of the string with
uppercase characters converted to lowercase and vice versa.
Calls :meth:`str.swapcase` element-wise.
For 8-bit strings, this method is locale-dependent.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.swapcase
Examples
--------
>>> import numpy as np
>>> c=np.array(['a1B c','1b Ca','b Ca1','cA1b'],'S5'); c
array(['a1B c', '1b Ca', 'b Ca1', 'cA1b'],
dtype='|S5')
>>> np.strings.swapcase(c)
array(['A1b C', '1B cA', 'B cA1', 'Ca1B'],
dtype='|S5')
"""
a_arr = np.asarray(a)
return _vec_string(a_arr, a_arr.dtype, 'swapcase')
def capitalize(a):
"""
Return a copy of ``a`` with only the first character of each element
capitalized.
Calls :meth:`str.capitalize` element-wise.
For byte strings, this method is locale-dependent.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array of strings to capitalize.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.capitalize
Examples
--------
>>> import numpy as np
>>> c = np.array(['a1b2','1b2a','b2a1','2a1b'],'S4'); c
array(['a1b2', '1b2a', 'b2a1', '2a1b'],
dtype='|S4')
>>> np.strings.capitalize(c)
array(['A1b2', '1b2a', 'B2a1', '2a1b'],
dtype='|S4')
"""
a_arr = np.asarray(a)
return _vec_string(a_arr, a_arr.dtype, 'capitalize')
def title(a):
"""
Return element-wise title cased version of string or unicode.
Title case words start with uppercase characters, all remaining cased
characters are lowercase.
Calls :meth:`str.title` element-wise.
For 8-bit strings, this method is locale-dependent.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.title
Examples
--------
>>> import numpy as np
>>> c=np.array(['a1b c','1b ca','b ca1','ca1b'],'S5'); c
array(['a1b c', '1b ca', 'b ca1', 'ca1b'],
dtype='|S5')
>>> np.strings.title(c)
array(['A1B C', '1B Ca', 'B Ca1', 'Ca1B'],
dtype='|S5')
"""
a_arr = np.asarray(a)
return _vec_string(a_arr, a_arr.dtype, 'title')
def replace(a, old, new, count=-1):
"""
For each element in ``a``, return a copy of the string with
occurrences of substring ``old`` replaced by ``new``.
Parameters
----------
a : array_like, with ``bytes_`` or ``str_`` dtype
old, new : array_like, with ``bytes_`` or ``str_`` dtype
count : array_like, with ``int_`` dtype
If the optional argument ``count`` is given, only the first
``count`` occurrences are replaced.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types
See Also
--------
str.replace
Examples
--------
>>> import numpy as np
>>> a = np.array(["That is a mango", "Monkeys eat mangos"])
>>> np.strings.replace(a, 'mango', 'banana')
array(['That is a banana', 'Monkeys eat bananas'], dtype='<U19')
>>> a = np.array(["The dish is fresh", "This is it"])
>>> np.strings.replace(a, 'is', 'was')
array(['The dwash was fresh', 'Thwas was it'], dtype='<U19')
"""
count = np.asanyarray(count)
if not np.issubdtype(count.dtype, np.integer):
raise TypeError(f"unsupported type {count.dtype} for operand 'count'")
arr = np.asanyarray(a)
old_dtype = getattr(old, 'dtype', None)
old = np.asanyarray(old)
new_dtype = getattr(new, 'dtype', None)
new = np.asanyarray(new)
if np.result_type(arr, old, new).char == "T":
return _replace(arr, old, new, count)
a_dt = arr.dtype
old = old.astype(old_dtype if old_dtype else a_dt, copy=False)
new = new.astype(new_dtype if new_dtype else a_dt, copy=False)
max_int64 = np.iinfo(np.int64).max
counts = _count_ufunc(arr, old, 0, max_int64)
counts = np.where(count < 0, counts, np.minimum(counts, count))
buffersizes = str_len(arr) + counts * (str_len(new) - str_len(old))
out_dtype = f"{arr.dtype.char}{buffersizes.max()}"
out = np.empty_like(arr, shape=buffersizes.shape, dtype=out_dtype)
return _replace(arr, old, new, counts, out=out)
def _join(sep, seq):
    """
    Return a string which is the concatenation of the strings in the
    sequence `seq`.

    Calls :meth:`str.join` element-wise.

    Parameters
    ----------
    sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
    seq : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    Returns
    -------
    out : ndarray
        Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
        depending on input types

    See Also
    --------
    str.join

    Examples
    --------
    >>> import numpy as np
    >>> np.strings.join('-', 'osd')  # doctest: +SKIP
    array('o-s-d', dtype='<U5')  # doctest: +SKIP

    >>> np.strings.join(['-', '.'], ['ghc', 'osd'])  # doctest: +SKIP
    array(['g-h-c', 'o.s.d'], dtype='<U5')  # doctest: +SKIP

    """
    # Join into an object array, then cast back to the dtype family of
    # ``seq``.
    joined = _vec_string(sep, np.object_, 'join', (seq,))
    return _to_bytes_or_str_array(joined, seq)
def _split(a, sep=None, maxsplit=None):
    """
    For each element in `a`, return a list of the words in the
    string, using `sep` as the delimiter string.

    Calls :meth:`str.split` element-wise.

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    sep : str or unicode, optional
       If `sep` is not specified or None, any whitespace string is a
       separator.

    maxsplit : int, optional
        If `maxsplit` is given, at most `maxsplit` splits are done.

    Returns
    -------
    out : ndarray
        Array of list objects

    See Also
    --------
    str.split, rsplit

    Examples
    --------
    >>> import numpy as np
    >>> x = np.array("Numpy is nice!")
    >>> np.strings.split(x, " ")  # doctest: +SKIP
    array(list(['Numpy', 'is', 'nice!']), dtype=object)  # doctest: +SKIP

    >>> np.strings.split(x, " ", 1)  # doctest: +SKIP
    array(list(['Numpy', 'is nice!']), dtype=object)  # doctest: +SKIP

    """
    # ``sep`` is always forwarded (None is a valid value for it);
    # ``maxsplit`` is forwarded only when it was given.
    call_args = [sep, *_clean_args(maxsplit)]
    # The per-element lists differ in length, so the result stays an
    # object array.
    return _vec_string(a, np.object_, 'split', call_args)
def _rsplit(a, sep=None, maxsplit=None):
    """
    Split every element of `a` into a list of words, with `sep` acting
    as the delimiter string and splitting performed from the right.

    Calls :meth:`str.rsplit` element-wise.

    Except for splitting from the right, `rsplit`
    behaves like `split`.

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    sep : str or unicode, optional
        If `sep` is not specified or None, any whitespace string
        is a separator.
    maxsplit : int, optional
        If `maxsplit` is given, at most `maxsplit` splits are done,
        the rightmost ones.

    Returns
    -------
    out : ndarray
        Array of list objects

    See Also
    --------
    str.rsplit, split

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array(['aAaAaA', 'abBABba'])
    >>> np.strings.rsplit(a, 'A')  # doctest: +SKIP
    array([list(['a', 'a', 'a', '']),  # doctest: +SKIP
           list(['abB', 'Bba'])], dtype=object)  # doctest: +SKIP

    """
    # `sep` is always forwarded (even when None); `maxsplit` is appended
    # through `_clean_args`.
    call_args = [sep]
    call_args.extend(_clean_args(maxsplit))
    # The per-element lists can differ in length, so the result stays an
    # object array.
    return _vec_string(a, np.object_, 'rsplit', call_args)
def _splitlines(a, keepends=None):
    """
    Break every element of `a` at its line boundaries and return the
    resulting lines as a list.

    Calls :meth:`str.splitlines` element-wise.

    Parameters
    ----------
    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype

    keepends : bool, optional
        Line breaks are not included in the resulting list unless
        keepends is given and true.

    Returns
    -------
    out : ndarray
        Array of list objects

    See Also
    --------
    str.splitlines

    """
    # `keepends` is passed through `_clean_args` before being forwarded
    # as the positional arguments of ``splitlines``.
    extra_args = _clean_args(keepends)
    return _vec_string(a, np.object_, 'splitlines', extra_args)
def partition(a, sep):
"""
Partition each element in ``a`` around ``sep``.
For each element in ``a``, split the element at the first
occurrence of ``sep``, and return a 3-tuple containing the part
before the separator, the separator itself, and the part after
the separator. If the separator is not found, the first item of
the tuple will contain the whole string, and the second and third
ones will be the empty string.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array
sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Separator to split each string element in ``a``.
Returns
-------
out : 3-tuple:
- array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
part before the separator
- array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
separator
- array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
part after the separator
See Also
--------
str.partition
Examples
--------
>>> import numpy as np
>>> x = np.array(["Numpy is nice!"])
>>> np.strings.partition(x, " ")
(array(['Numpy'], dtype='<U5'),
array([' '], dtype='<U1'),
array(['is nice!'], dtype='<U8'))
"""
a = np.asanyarray(a)
sep = np.asanyarray(sep)
if np.result_type(a, sep).char == "T":
return _partition(a, sep)
sep = sep.astype(a.dtype, copy=False)
pos = _find_ufunc(a, sep, 0, MAX)
a_len = str_len(a)
sep_len = str_len(sep)
not_found = pos < 0
buffersizes1 = np.where(not_found, a_len, pos)
buffersizes3 = np.where(not_found, 0, a_len - pos - sep_len)
out_dtype = ",".join([f"{a.dtype.char}{n}" for n in (
buffersizes1.max(),
1 if np.all(not_found) else sep_len.max(),
buffersizes3.max(),
)])
shape = np.broadcast_shapes(a.shape, sep.shape)
out = np.empty_like(a, shape=shape, dtype=out_dtype)
return _partition_index(a, sep, pos, out=(out["f0"], out["f1"], out["f2"]))
def rpartition(a, sep):
"""
Partition (split) each element around the right-most separator.
For each element in ``a``, split the element at the last
occurrence of ``sep``, and return a 3-tuple containing the part
before the separator, the separator itself, and the part after
the separator. If the separator is not found, the third item of
the tuple will contain the whole string, and the first and second
ones will be the empty string.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array
sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Separator to split each string element in ``a``.
Returns
-------
out : 3-tuple:
- array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
part before the separator
- array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
separator
- array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
part after the separator
See Also
--------
str.rpartition
Examples
--------
>>> import numpy as np
>>> a = np.array(['aAaAaA', ' aA ', 'abBABba'])
>>> np.strings.rpartition(a, 'A')
(array(['aAaAa', ' a', 'abB'], dtype='<U5'),
array(['A', 'A', 'A'], dtype='<U1'),
array(['', ' ', 'Bba'], dtype='<U3'))
"""
a = np.asanyarray(a)
sep = np.asanyarray(sep)
if np.result_type(a, sep).char == "T":
return _rpartition(a, sep)
sep = sep.astype(a.dtype, copy=False)
pos = _rfind_ufunc(a, sep, 0, MAX)
a_len = str_len(a)
sep_len = str_len(sep)
not_found = pos < 0
buffersizes1 = np.where(not_found, 0, pos)
buffersizes3 = np.where(not_found, a_len, a_len - pos - sep_len)
out_dtype = ",".join([f"{a.dtype.char}{n}" for n in (
buffersizes1.max(),
1 if np.all(not_found) else sep_len.max(),
buffersizes3.max(),
)])
shape = np.broadcast_shapes(a.shape, sep.shape)
out = np.empty_like(a, shape=shape, dtype=out_dtype)
return _rpartition_index(
a, sep, pos, out=(out["f0"], out["f1"], out["f2"]))
def translate(a, table, deletechars=None):
"""
For each element in `a`, return a copy of the string where all
characters occurring in the optional argument `deletechars` are
removed, and the remaining characters have been mapped through the
given translation table.
Calls :meth:`str.translate` element-wise.
Parameters
----------
a : array-like, with `np.bytes_` or `np.str_` dtype
table : str of length 256
deletechars : str
Returns
-------
out : ndarray
Output array of str or unicode, depending on input type
See Also
--------
str.translate
Examples
--------
>>> import numpy as np
>>> a = np.array(['a1b c', '1bca', 'bca1'])
>>> table = a[0].maketrans('abc', '123')
>>> deletechars = ' '
>>> np.char.translate(a, table, deletechars)
array(['112 3', '1231', '2311'], dtype='<U5')
"""
a_arr = np.asarray(a)
if issubclass(a_arr.dtype.type, np.str_):
return _vec_string(
a_arr, a_arr.dtype, 'translate', (table,))
else:
return _vec_string(
a_arr,
a_arr.dtype,
'translate',
[table] + _clean_args(deletechars)
)