Mise à jour de Monitor.py et autres scripts

2025-07-23 10:46:27 +02:00
parent 7081418ce0
commit 7de3e0fb50
8604 changed files with 2789953 additions and 295 deletions
--- a/myenv/lib/python3.11/site-packages/pyarrow/interchange/column.py
+++ b/myenv/lib/python3.11/site-packages/pyarrow/interchange/column.py
@@ -0,0 +1,529 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import enum
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    Optional,
+    Tuple,
+)
+
+import sys
+if sys.version_info >= (3, 8):
+    from typing import TypedDict
+else:
+    from typing_extensions import TypedDict
+
+import pyarrow as pa
+import pyarrow.compute as pc
+from pyarrow.interchange.buffer import _PyArrowBuffer
+
+
+class DtypeKind(enum.IntEnum):
+    """
+    Integer enum for data types.
+
+    Attributes
+    ----------
+    INT : int
+        Matches to signed integer data type.
+    UINT : int
+        Matches to unsigned integer data type.
+    FLOAT : int
+        Matches to floating point data type.
+    BOOL : int
+        Matches to boolean data type.
+    STRING : int
+        Matches to string data type (UTF-8 encoded).
+    DATETIME : int
+        Matches to datetime data type.
+    CATEGORICAL : int
+        Matches to categorical data type.
+    """
+
+    INT = 0
+    UINT = 1
+    FLOAT = 2
+    BOOL = 20
+    STRING = 21  # UTF-8
+    DATETIME = 22
+    CATEGORICAL = 23
+
+
+Dtype = Tuple[DtypeKind, int, str, str]  # see Column.dtype
+
+
+_PYARROW_KINDS = {
+    pa.int8(): (DtypeKind.INT, "c"),
+    pa.int16(): (DtypeKind.INT, "s"),
+    pa.int32(): (DtypeKind.INT, "i"),
+    pa.int64(): (DtypeKind.INT, "l"),
+    pa.uint8(): (DtypeKind.UINT, "C"),
+    pa.uint16(): (DtypeKind.UINT, "S"),
+    pa.uint32(): (DtypeKind.UINT, "I"),
+    pa.uint64(): (DtypeKind.UINT, "L"),
+    pa.float16(): (DtypeKind.FLOAT, "e"),
+    pa.float32(): (DtypeKind.FLOAT, "f"),
+    pa.float64(): (DtypeKind.FLOAT, "g"),
+    pa.bool_(): (DtypeKind.BOOL, "b"),
+    pa.string(): (DtypeKind.STRING, "u"),
+    pa.large_string(): (DtypeKind.STRING, "U"),
+}
+
+
+class ColumnNullType(enum.IntEnum):
+    """
+    Integer enum for null type representation.
+
+    Attributes
+    ----------
+    NON_NULLABLE : int
+        Non-nullable column.
+    USE_NAN : int
+        Use explicit float NaN value.
+    USE_SENTINEL : int
+        Sentinel value besides NaN.
+    USE_BITMASK : int
+        The bit is set/unset representing a null on a certain position.
+    USE_BYTEMASK : int
+        The byte is set/unset representing a null on a certain position.
+    """
+
+    NON_NULLABLE = 0
+    USE_NAN = 1
+    USE_SENTINEL = 2
+    USE_BITMASK = 3
+    USE_BYTEMASK = 4
+
+
+class ColumnBuffers(TypedDict):
+    # first element is a buffer containing the column data;
+    # second element is the data buffer's associated dtype
+    data: Tuple[_PyArrowBuffer, Dtype]
+
+    # first element is a buffer containing mask values indicating missing data;
+    # second element is the mask value buffer's associated dtype.
+    # None if the null representation is not a bit or byte mask
+    validity: Optional[Tuple[_PyArrowBuffer, Dtype]]
+
+    # first element is a buffer containing the offset values for
+    # variable-size binary data (e.g., variable-length strings);
+    # second element is the offsets buffer's associated dtype.
+    # None if the data buffer does not have an associated offsets buffer
+    offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]
+
+
+class CategoricalDescription(TypedDict):
+    # whether the ordering of dictionary indices is semantically meaningful
+    is_ordered: bool
+    # whether a dictionary-style mapping of categorical values to other objects
+    # exists
+    is_dictionary: bool
+    # Python-level only (e.g. ``{int: str}``).
+    # None if not a dictionary-style categorical.
+    categories: Optional[_PyArrowColumn]
+
+
+class Endianness:
+    """Enum indicating the byte-order of a data-type."""
+
+    LITTLE = "<"
+    BIG = ">"
+    NATIVE = "="
+    NA = "|"
+
+
+class NoBufferPresent(Exception):
+    """Exception to signal that there is no requested buffer."""
+
+
+class _PyArrowColumn:
+    """
+    A column object, with only the methods and properties required by the
+    interchange protocol defined.
+
+    A column can contain one or more chunks. Each chunk can contain up to three
+    buffers - a data buffer, a mask buffer (depending on null representation),
+    and an offsets buffer (if variable-size binary; e.g., variable-length
+    strings).
+
+    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
+         Instead, it seems to use "children" for both columns with a bit mask,
+         and for nested dtypes. Unclear whether this is elegant or confusing.
+         This design requires checking the null representation explicitly.
+
+         The Arrow design requires checking:
+         1. the ARROW_FLAG_NULLABLE (for sentinel values)
+         2. if a column has two children, combined with one of those children
+            having a null dtype.
+
+         Making the mask concept explicit seems useful. One null dtype would
+         not be enough to cover both bit and byte masks, so that would mean
+         even more checking if we did it the Arrow way.
+
+    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
+         multiple buffers per array (= column here). Semantically it may make
+         sense to have both: chunks were meant for example for lazy evaluation
+         of data which doesn't fit in memory, while multiple buffers per column
+         could also come from doing a selection operation on a single
+         contiguous buffer.
+
+         Given these concepts, one would expect chunks to be all of the same
+         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
+         while multiple buffers could have data-dependent lengths. Not an issue
+         in pandas if one column is backed by a single NumPy array, but in
+         Arrow it seems possible.
+         Are multiple chunks *and* multiple buffers per column necessary for
+         the purposes of this interchange protocol, or must producers either
+         reuse the chunk concept for this or copy the data?
+
+    Note: this Column object can only be produced by ``__dataframe__``, so
+          doesn't need its own version or ``__column__`` protocol.
+    """
+
+    def __init__(
+        self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True
+    ) -> None:
+        """
+        Handles PyArrow Arrays and ChunkedArrays.
+        """
+        # Store the column as a private attribute
+        if isinstance(column, pa.ChunkedArray):
+            if column.num_chunks == 1:
+                column = column.chunk(0)
+            else:
+                if not allow_copy:
+                    raise RuntimeError(
+                        "Chunks will be combined and a copy is required which "
+                        "is forbidden by allow_copy=False"
+                    )
+                column = column.combine_chunks()
+
+        self._allow_copy = allow_copy
+
+        if pa.types.is_boolean(column.type):
+            if not allow_copy:
+                raise RuntimeError(
+                    "Boolean column will be casted to uint8 and a copy "
+                    "is required which is forbidden by allow_copy=False"
+                )
+            self._dtype = self._dtype_from_arrowdtype(column.type, 8)
+            self._col = pc.cast(column, pa.uint8())
+        else:
+            self._col = column
+            dtype = self._col.type
+            try:
+                bit_width = dtype.bit_width
+            except ValueError:
+                # in case of a variable-length strings, considered as array
+                # of bytes (8 bits)
+                bit_width = 8
+            self._dtype = self._dtype_from_arrowdtype(dtype, bit_width)
+
+    def size(self) -> int:
+        """
+        Size of the column, in elements.
+
+        Corresponds to DataFrame.num_rows() if column is a single chunk;
+        equal to size of this current chunk otherwise.
+
+        Is a method rather than a property because it may cause a (potentially
+        expensive) computation for some dataframe implementations.
+        """
+        return len(self._col)
+
+    @property
+    def offset(self) -> int:
+        """
+        Offset of first element.
+
+        May be > 0 if using chunks; for example for a column with N chunks of
+        equal size M (only the last chunk may be shorter),
+        ``offset = n * M``, ``n = 0 .. N-1``.
+        """
+        return self._col.offset
+
+    @property
+    def dtype(self) -> Tuple[DtypeKind, int, str, str]:
+        """
+        Dtype description as a tuple ``(kind, bit-width, format string,
+        endianness)``.
+
+        Bit-width : the number of bits as an integer
+        Format string : data type description format string in Apache Arrow C
+                        Data Interface format.
+        Endianness : current only native endianness (``=``) is supported
+
+        Notes:
+            - Kind specifiers are aligned with DLPack where possible (hence the
+              jump to 20, leave enough room for future extension)
+            - Masks must be specified as boolean with either bit width 1 (for
+              bit masks) or 8 (for byte masks).
+            - Dtype width in bits was preferred over bytes
+            - Endianness isn't too useful, but included now in case in the
+              future we need to support non-native endianness
+            - Went with Apache Arrow format strings over NumPy format strings
+              because they're more complete from a dataframe perspective
+            - Format strings are mostly useful for datetime specification, and
+              for categoricals.
+            - For categoricals, the format string describes the type of the
+              categorical in the data buffer. In case of a separate encoding of
+              the categorical (e.g. an integer to string mapping), this can
+              be derived from ``self.describe_categorical``.
+            - Data types not included: complex, Arrow-style null, binary,
+              decimal, and nested (list, struct, map, union) dtypes.
+        """
+        return self._dtype
+
+    def _dtype_from_arrowdtype(
+        self, dtype: pa.DataType, bit_width: int
+    ) -> Tuple[DtypeKind, int, str, str]:
+        """
+        See `self.dtype` for details.
+        """
+        # Note: 'c' (complex) not handled yet (not in array spec v1).
+        #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void)
+        #       not handled datetime and timedelta both map to datetime
+        #       (is timedelta handled?)
+
+        if pa.types.is_timestamp(dtype):
+            kind = DtypeKind.DATETIME
+            ts = dtype.unit[0]
+            tz = dtype.tz if dtype.tz else ""
+            f_string = f"ts{ts}:{tz}"
+            return kind, bit_width, f_string, Endianness.NATIVE
+        elif pa.types.is_dictionary(dtype):
+            kind = DtypeKind.CATEGORICAL
+            arr = self._col
+            indices_dtype = arr.indices.type
+            _, f_string = _PYARROW_KINDS.get(indices_dtype)
+            return kind, bit_width, f_string, Endianness.NATIVE
+        else:
+            kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
+            if kind is None:
+                raise ValueError(
+                    f"Data type {dtype} not supported by interchange protocol")
+
+            return kind, bit_width, f_string, Endianness.NATIVE
+
+    @property
+    def describe_categorical(self) -> CategoricalDescription:
+        """
+        If the dtype is categorical, there are two options:
+        - There are only values in the data buffer.
+        - There is a separate non-categorical Column encoding categorical
+          values.
+
+        Raises TypeError if the dtype is not categorical
+
+        Returns the dictionary with description on how to interpret the
+        data buffer:
+            - "is_ordered" : bool, whether the ordering of dictionary indices
+                             is semantically meaningful.
+            - "is_dictionary" : bool, whether a mapping of
+                                categorical values to other objects exists
+            - "categories" : Column representing the (implicit) mapping of
+                             indices to category values (e.g. an array of
+                             cat1, cat2, ...). None if not a dictionary-style
+                             categorical.
+
+        TBD: are there any other in-memory representations that are needed?
+        """
+        arr = self._col
+        if not pa.types.is_dictionary(arr.type):
+            raise TypeError(
+                "describe_categorical only works on a column with "
+                "categorical dtype!"
+            )
+
+        return {
+            "is_ordered": self._col.type.ordered,
+            "is_dictionary": True,
+            "categories": _PyArrowColumn(arr.dictionary),
+        }
+
+    @property
+    def describe_null(self) -> Tuple[ColumnNullType, Any]:
+        """
+        Return the missing value (or "null") representation the column dtype
+        uses, as a tuple ``(kind, value)``.
+
+        Value : if kind is "sentinel value", the actual value. If kind is a bit
+        mask or a byte mask, the value (0 or 1) indicating a missing value.
+        None otherwise.
+        """
+        # In case of no missing values, we need to set ColumnNullType to
+        # non nullable as in the current __dataframe__ protocol bit/byte masks
+        # cannot be None
+        if self.null_count == 0:
+            return ColumnNullType.NON_NULLABLE, None
+        else:
+            return ColumnNullType.USE_BITMASK, 0
+
+    @property
+    def null_count(self) -> int:
+        """
+        Number of null elements, if known.
+
+        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
+        """
+        arrow_null_count = self._col.null_count
+        n = arrow_null_count if arrow_null_count != -1 else None
+        return n
+
+    @property
+    def metadata(self) -> Dict[str, Any]:
+        """
+        The metadata for the column. See `DataFrame.metadata` for more details.
+        """
+        pass
+
+    def num_chunks(self) -> int:
+        """
+        Return the number of chunks the column consists of.
+        """
+        return 1
+
+    def get_chunks(
+        self, n_chunks: Optional[int] = None
+    ) -> Iterable[_PyArrowColumn]:
+        """
+        Return an iterator yielding the chunks.
+
+        See `DataFrame.get_chunks` for details on ``n_chunks``.
+        """
+        if n_chunks and n_chunks > 1:
+            chunk_size = self.size() // n_chunks
+            if self.size() % n_chunks != 0:
+                chunk_size += 1
+
+            array = self._col
+            i = 0
+            for start in range(0, chunk_size * n_chunks, chunk_size):
+                yield _PyArrowColumn(
+                    array.slice(start, chunk_size), self._allow_copy
+                )
+                i += 1
+        else:
+            yield self
+
+    def get_buffers(self) -> ColumnBuffers:
+        """
+        Return a dictionary containing the underlying buffers.
+
+        The returned dictionary has the following contents:
+
+            - "data": a two-element tuple whose first element is a buffer
+                      containing the data and whose second element is the data
+                      buffer's associated dtype.
+            - "validity": a two-element tuple whose first element is a buffer
+                          containing mask values indicating missing data and
+                          whose second element is the mask value buffer's
+                          associated dtype. None if the null representation is
+                          not a bit or byte mask.
+            - "offsets": a two-element tuple whose first element is a buffer
+                         containing the offset values for variable-size binary
+                         data (e.g., variable-length strings) and whose second
+                         element is the offsets buffer's associated dtype. None
+                         if the data buffer does not have an associated offsets
+                         buffer.
+        """
+        buffers: ColumnBuffers = {
+            "data": self._get_data_buffer(),
+            "validity": None,
+            "offsets": None,
+        }
+
+        try:
+            buffers["validity"] = self._get_validity_buffer()
+        except NoBufferPresent:
+            pass
+
+        try:
+            buffers["offsets"] = self._get_offsets_buffer()
+        except NoBufferPresent:
+            pass
+
+        return buffers
+
+    def _get_data_buffer(
+        self,
+    ) -> Tuple[_PyArrowBuffer, Any]:  # Any is for self.dtype tuple
+        """
+        Return the buffer containing the data and the buffer's
+        associated dtype.
+        """
+        array = self._col
+        dtype = self.dtype
+
+        # In case of dictionary arrays, use indices
+        # to define a buffer, codes are transferred through
+        # describe_categorical()
+        if pa.types.is_dictionary(array.type):
+            array = array.indices
+            dtype = _PyArrowColumn(array).dtype
+
+        n = len(array.buffers())
+        if n == 2:
+            return _PyArrowBuffer(array.buffers()[1]), dtype
+        elif n == 3:
+            return _PyArrowBuffer(array.buffers()[2]), dtype
+
+    def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
+        """
+        Return the buffer containing the mask values indicating missing data
+        and the buffer's associated dtype.
+        Raises NoBufferPresent if null representation is not a bit or byte
+        mask.
+        """
+        # Define the dtype of the returned buffer
+        dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE)
+        array = self._col
+        buff = array.buffers()[0]
+        if buff:
+            return _PyArrowBuffer(buff), dtype
+        else:
+            raise NoBufferPresent(
+                "There are no missing values so "
+                "does not have a separate mask")
+
+    def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
+        """
+        Return the buffer containing the offset values for variable-size binary
+        data (e.g., variable-length strings) and the buffer's associated dtype.
+        Raises NoBufferPresent if the data buffer does not have an associated
+        offsets buffer.
+        """
+        array = self._col
+        n = len(array.buffers())
+        if n == 2:
+            raise NoBufferPresent(
+                "This column has a fixed-length dtype so "
+                "it does not have an offsets buffer"
+            )
+        elif n == 3:
+            # Define the dtype of the returned buffer
+            dtype = self._col.type
+            if pa.types.is_large_string(dtype):
+                dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE)
+            else:
+                dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
+            return _PyArrowBuffer(array.buffers()[1]), dtype