# -*- coding: utf-8 -*-
"""
This module contains functionality for persisting/serialising DataFrames.
"""


import datetime
import logging
import time
from typing import Iterable, Optional

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.parquet import ParquetFile
from simplekv import KeyValueStore

from ._generic import (
    DataFrameSerializer,
    PredicatesType,
    check_predicates,
    filter_df,
    filter_df_from_predicates,
)
from ._io_buffer import BlockBuffer
from ._util import ensure_unicode_string_type

try:
    # Only check for BotoStore instance if boto is really installed
    from simplekv.net.botostore import BotoStore

    HAVE_BOTO = True
except ImportError:
    HAVE_BOTO = False

_logger = logging.getLogger(__name__)


EPOCH_ORDINAL = datetime.date(1970, 1, 1).toordinal()
MAX_NB_RETRIES = 6  # longest retry backoff = BACKOFF_TIME * 2**(MAX_NB_RETRIES - 2)
BACKOFF_TIME = 0.01  # 10 ms
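
# Illustrative note (sketch, not used by the library code): with the values above, the
# retry loop in ``ParquetSerializer.restore_dataframe`` sleeps ``BACKOFF_TIME * 2**nb_retry``
# seconds after every failed attempt except the last one, i.e. the backoff schedule is
#
#     >>> [BACKOFF_TIME * 2 ** i for i in range(MAX_NB_RETRIES - 1)]
#     [0.01, 0.02, 0.04, 0.08, 0.16]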


def _empty_table_from_schema(parquet_file):
    schema = parquet_file.schema.to_arrow_schema()

    return schema.empty_table()


def _reset_dictionary_columns(table, exclude=None):
    """
    We need to ensure that the dtype is exactly as requested, see GH227
    """
    if exclude is None:
        exclude = []

    schema = table.schema
    for i in range(len(schema)):
        field = schema[i]
        if field.name in exclude:
            continue
        if pa.types.is_dictionary(field.type):
            new_field = pa.field(
                field.name, field.type.value_type, field.nullable, field.metadata,
            )
            schema = schema.remove(i).insert(i, new_field)

    table = table.cast(schema)
    return table
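

# Illustrative sketch (comment only, not executed on import): ``_reset_dictionary_columns``
# casts dictionary-encoded fields back to their value type unless they are excluded,
# e.g. via the ``categories`` argument of ``restore_dataframe``. Using plain pyarrow calls:
#
#     >>> t = pa.table({"col": pa.array(["a", "b", "a"]).dictionary_encode()})
#     >>> pa.types.is_dictionary(t.schema[0].type)
#     True
#     >>> pa.types.is_string(_reset_dictionary_columns(t).schema[0].type)
#     True
#     >>> pa.types.is_dictionary(_reset_dictionary_columns(t, exclude=["col"]).schema[0].type)
#     True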


class ParquetReadError(IOError):
    """
    Internal kartothek error while attempting to read Parquet file
    """

    pass


class ParquetSerializer(DataFrameSerializer):
    """Serializer to store a :class:`pandas.DataFrame` as parquet.

    On top of the plain serialization, this class handles forward and backward
    compatibility between pyarrow versions.

    Parameters
    ----------
    compression
        The compression algorithm to be used for the parquet file. For a
        comprehensive list of available compression algorithms, please see
        :func:`pyarrow.parquet.write_table`. The default is set to "SNAPPY" which
        usually offers a good balance between performance and compression rate.
        Depending on your data, picking a different algorithm may have vastly
        different characteristics and we can only recommend testing this on your
        own data. Depending on the reader parquet implementation, some compression
        algorithms may not be supported and we recommend consulting the
        documentation of the reader libraries first.
    chunk_size
        The number of rows stored in a Parquet RowGroup. To leverage predicate
        pushdown, it is necessary to set this value. We do not apply any default
        value since a good choice is very sensitive to the kind of data you are
        using and the kind of storage. A typical range to try out would be
        somewhere between 50k-200k rows. To fully leverage row group statistics,
        it is highly recommended to sort the file before serialization.

    Notes
    -----
    Regarding type stability and supported types there are a few known limitations
    users should be aware of.

    .. ipython:: python
        :suppress:

        from kartothek.core.utils import ensure_store
        import pandas as pd
        from kartothek.serialization import ParquetSerializer

        store = ensure_store("hmemory://")

    * `pandas.Categorical`

      Kartothek offers the keyword argument `categories` which contains a list
      of field names which are supposed to be retrieved as a `pandas.Categorical`.

      See also :ref:`Dictionary Encoding`

      .. ipython:: python

          ser = ParquetSerializer()

          df = pd.DataFrame({"cat_field": pd.Categorical(["A"])})
          df.dtypes
          ser.restore_dataframe(store, ser.store(store, "cat", df))
          ser.restore_dataframe(store, ser.store(store, "cat", df), categories=["cat_field"])

    * Timestamps with nanosecond resolution

      Timestamps can only be stored with microsecond (`us`) accuracy. Trying to
      store finer resolutions may raise an exception.

      See also :ref:`timestamp`

      .. ipython:: python
          :okexcept:

          import pyarrow as pa

          pa.__version__
          df = pd.DataFrame({"nanosecond": [pd.Timestamp("2021-01-01 00:00:00.0000001")]})
          # nanosecond resolution
          ser.store(store, "key", df)
    """

    _PARQUET_VERSION = "2.0"
    type_stable = True

    def __init__(
        self, compression: str = "SNAPPY", chunk_size: Optional[int] = None
    ) -> None:
        self.compression = compression
        if chunk_size is not None:
            if not isinstance(chunk_size, int):
                raise TypeError(
                    "Cannot initialize ParquetSerializer because chunk size is not integer type"
                )
            if chunk_size < 1:
                raise ValueError(
                    "Cannot initialize ParquetSerializer because chunk size < 1"
                )
        self.chunk_size = chunk_size

    def __eq__(self, other):
        return (
            isinstance(other, ParquetSerializer)
            and (self.compression == other.compression)
            and (self.chunk_size == other.chunk_size)
        )

    def __repr__(self):
        return "ParquetSerializer(compression={compression!r}, chunk_size={chunk_size!r})".format(
            compression=self.compression, chunk_size=self.chunk_size
        )

    @staticmethod
    def _restore_dataframe(
        store: KeyValueStore,
        key: str,
        filter_query: Optional[str] = None,
        columns: Optional[Iterable[str]] = None,
        predicate_pushdown_to_io: bool = True,
        categories: Optional[Iterable[str]] = None,
        predicates: Optional[PredicatesType] = None,
        date_as_object: bool = False,
    ) -> pd.DataFrame:
        check_predicates(predicates)
        # If we want to do columnar access we can benefit from partial reads,
        # otherwise a full read en bloc is the better option.
        if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
            with pa.BufferReader(store.get(key)) as reader:
                table = pq.read_pandas(reader, columns=columns)
        else:
            if HAVE_BOTO and isinstance(store, BotoStore):
                # Parquet and seeks on S3 currently leak connections thus
                # we omit column projection to the store.
                reader = pa.BufferReader(store.get(key))
            else:
                reader = store.open(key)
                # Buffer at least 4 MB in requests. This is chosen because the default block size of the Azure
                # storage client is 4MB.
                reader = BlockBuffer(reader, 4 * 1024 * 1024)
            try:
                parquet_file = ParquetFile(reader)
                if predicates and parquet_file.metadata.num_rows > 0:
                    # We need to calculate different predicates for predicate
                    # pushdown and the later DataFrame filtering. This is required
                    # e.g. in the case where we have an `in` predicate as this has
                    # different normalized values.
                    columns_to_io = _columns_for_pushdown(columns, predicates)
                    predicates_for_pushdown = _normalize_predicates(
                        parquet_file, predicates, True
                    )
                    predicates = _normalize_predicates(parquet_file, predicates, False)
                    tables = _read_row_groups_into_tables(
                        parquet_file, columns_to_io, predicates_for_pushdown
                    )

                    if len(tables) == 0:
                        table = _empty_table_from_schema(parquet_file)
                    else:
                        table = pa.concat_tables(tables)
                else:
                    # ARROW-5139 Column projection with empty columns returns a table w/out index
                    if columns == []:
                        # Create an arrow table with expected index length.
                        df = (
                            parquet_file.schema.to_arrow_schema()
                            .empty_table()
                            .to_pandas(date_as_object=date_as_object)
                        )
                        index = pd.Int64Index(
                            pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows)
                        )
                        df = pd.DataFrame(df, index=index)
                        # Convert back to table to keep downstream code untouched by this patch.
                        table = pa.Table.from_pandas(df)
                    else:
                        table = pq.read_pandas(reader, columns=columns)
            finally:
                reader.close()

        if columns is not None:
            missing_columns = set(columns) - set(table.schema.names)
            if missing_columns:
                raise ValueError(
                    "Columns cannot be found in stored dataframe: {missing}".format(
                        missing=", ".join(sorted(missing_columns))
                    )
                )

        table = _reset_dictionary_columns(table, exclude=categories)
        df = table.to_pandas(categories=categories, date_as_object=date_as_object)
        df.columns = df.columns.map(ensure_unicode_string_type)
        if predicates:
            df = filter_df_from_predicates(
                df, predicates, strict_date_types=date_as_object
            )
        else:
            df = filter_df(df, filter_query)
        if columns is not None:
            return df.reindex(columns=columns, copy=False)
        else:
            return df
    @classmethod
    def restore_dataframe(
        cls,
        store: KeyValueStore,
        key: str,
        filter_query: Optional[str] = None,
        columns: Optional[Iterable[str]] = None,
        predicate_pushdown_to_io: bool = True,
        categories: Optional[Iterable[str]] = None,
        predicates: Optional[PredicatesType] = None,
        date_as_object: bool = False,
    ) -> pd.DataFrame:
        # https://github.com/JDASoftwareGroup/kartothek/issues/407 We have been seeing weird `IOError`s while
        # reading Parquet files from Azure Blob Store. These errors have caused long-running computations to fail.
        # The workaround is to retry the deserialization here and gain more stability for long-running tasks.
        # This code should not live forever; it should be removed once the underlying cause has been resolved.
        for nb_retry in range(MAX_NB_RETRIES):
            try:
                return cls._restore_dataframe(
                    store=store,
                    key=key,
                    filter_query=filter_query,
                    columns=columns,
                    predicate_pushdown_to_io=predicate_pushdown_to_io,
                    categories=categories,
                    predicates=predicates,
                    date_as_object=date_as_object,
                )
            # We only retry OSErrors (note that IOError inherits from OSError), as these kinds of errors
            # may benefit from retries.
            except OSError as err:
                raised_error = err
                _logger.warning(
                    msg=(
                        f"Failed to restore dataframe, attempt {nb_retry + 1} of {MAX_NB_RETRIES} with parameters "
                        f"key: {key}, filter_query: {filter_query}, columns: {columns}, "
                        f"predicate_pushdown_to_io: {predicate_pushdown_to_io}, categories: {categories}, "
                        f"predicates: {predicates}, date_as_object: {date_as_object}."
                    ),
                    exc_info=True,
                )
                # We don't sleep after the last attempt.
                if nb_retry < (MAX_NB_RETRIES - 1):
                    time.sleep(BACKOFF_TIME * 2 ** nb_retry)

        raise ParquetReadError(
            f"Failed to restore dataframe after {MAX_NB_RETRIES} attempts. Parameters: "
            f"key: {key}, filter_query: {filter_query}, columns: {columns}, "
            f"predicate_pushdown_to_io: {predicate_pushdown_to_io}, categories: {categories}, "
            f"date_as_object: {date_as_object}, predicates: {predicates}."
        ) from raised_error
    def store(self, store, key_prefix, df):
        key = "{}.parquet".format(key_prefix)
        if isinstance(df, pa.Table):
            table = df
        else:
            table = pa.Table.from_pandas(df)
        buf = pa.BufferOutputStream()

        pq.write_table(
            table,
            buf,
            version=self._PARQUET_VERSION,
            chunk_size=self.chunk_size,
            compression=self.compression,
            coerce_timestamps="us",
        )
        store.put(key, buf.getvalue().to_pybytes())
        return key
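
# Usage sketch (comment only; assumes a simplekv-compatible ``store`` such as the
# ``hmemory://`` store from the class docstring above, key names are arbitrary).
# With ``chunk_size`` set, each RowGroup carries its own statistics, so predicate
# pushdown can skip RowGroups whose min/max range cannot match:
#
#     >>> ser = ParquetSerializer(chunk_size=2)
#     >>> df = pd.DataFrame({"x": [1, 2, 3, 4], "y": list("abcd")})
#     >>> key = ser.store(store, "partition0", df)
#     >>> restored = ser.restore_dataframe(store, key, predicates=[[("x", ">", 2)]])
#     >>> sorted(restored["x"].tolist())
#     [3, 4]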
def _columns_for_pushdown(columns, predicates):
    if columns is None:
        return
    new_cols = columns[:]
    for conjunction in predicates:
        for literal in conjunction:
            if literal[0] not in columns:
                new_cols.append(literal[0])
    return new_cols


def _read_row_groups_into_tables(parquet_file, columns, predicates_in):
    """
    For each RowGroup check if the predicate in DNF applies and then read
    the respective RowGroup.
    """
    arrow_schema = parquet_file.schema.to_arrow_schema()
    parquet_reader = parquet_file.reader

    def all_predicates_accept(row):
        # Check if the predicates evaluate on this RowGroup.
        # As the predicate is in DNF, only one of the inner lists needs to
        # match. Once we have found a positive match, there is no need to
        # check whether the remaining ones apply.
        row_meta = parquet_file.metadata.row_group(row)
        for predicate_list in predicates_in:
            if all(
                _predicate_accepts(predicate, row_meta, arrow_schema, parquet_reader)
                for predicate in predicate_list
            ):
                return True
        return False

    # Iterate over the RowGroups and evaluate the list of predicates on each
    # one of them. Only access those that could contain a row where we could
    # get an exact match of the predicate.
    result = []
    for row in range(parquet_file.num_row_groups):
        if all_predicates_accept(row):
            row_group = parquet_file.read_row_group(row, columns=columns)
            result.append(row_group)
    return result


def _normalize_predicates(parquet_file, predicates, for_pushdown):
    schema = parquet_file.schema.to_arrow_schema()

    normalized_predicates = []
    for conjunction in predicates:
        new_conjunction = []

        for literal in conjunction:
            col, op, val = literal
            col_idx = parquet_file.reader.column_name_idx(col)
            pa_type = schema[col_idx].type
            column_name = schema[col_idx].name

            if pa.types.is_null(pa_type):
                # Early exit, the entire conjunction evaluates to False.
                new_conjunction = None
                break

            if op == "in":
                values = [
                    _normalize_value(lit, pa_type, column_name=column_name)
                    for lit in literal[2]
                ]
                if for_pushdown and values:
                    normalized_value = [
                        _timelike_to_arrow_encoding(value, pa_type) for value in values
                    ]
                else:
                    normalized_value = values
            else:
                normalized_value = _normalize_value(
                    literal[2], pa_type, column_name=column_name
                )
                if for_pushdown:
                    normalized_value = _timelike_to_arrow_encoding(
                        normalized_value, pa_type
                    )
            new_literal = (literal[0], literal[1], normalized_value)
            new_conjunction.append(new_literal)

        if new_conjunction is not None:
            normalized_predicates.append(new_conjunction)
    return normalized_predicates


def _timelike_to_arrow_encoding(value, pa_type):
    # Date32 columns are encoded as days since 1970
    if pa.types.is_date32(pa_type):
        if isinstance(value, datetime.date):
            return value.toordinal() - EPOCH_ORDINAL
    else:
        return value


def _normalize_value(value, pa_type, column_name=None):
    if pa.types.is_dictionary(pa_type):
        pa_type = pa_type.value_type
    if pa.types.is_string(pa_type):
        if isinstance(value, bytes):
            return value.decode("utf-8")
        elif isinstance(value, str):
            return value
        elif value is None:
            return value
    elif pa.types.is_binary(pa_type):
        if isinstance(value, bytes):
            return value
        elif isinstance(value, str):
            return str(value).encode("utf-8")
    elif (
        pa.types.is_integer(pa_type)
        and pd.api.types.is_integer(value)
        or pa.types.is_floating(pa_type)
        and pd.api.types.is_float(value)
        or pa.types.is_boolean(pa_type)
        and pd.api.types.is_bool(value)
        or pa.types.is_timestamp(pa_type)
        and not isinstance(value, (bytes, str))
        and (
            pd.api.types.is_datetime64_dtype(value)
            or isinstance(value, datetime.datetime)
        )
    ):
        return value
    elif pa.types.is_date(pa_type):
        if isinstance(value, str):
            return datetime.datetime.strptime(value, "%Y-%m-%d").date()
        elif isinstance(value, bytes):
            value = value.decode("utf-8")
            return datetime.datetime.strptime(value, "%Y-%m-%d").date()
        elif isinstance(value, datetime.date):
            if isinstance(value, datetime.datetime):
                raise TypeError(
                    f"Unexpected type for predicate: Column {column_name!r} is an "
                    f"Arrow date ({pa_type}), but predicate value has type {type(value)}. "
                    f"Use a Python 'datetime.date' object instead."
                )
            else:
                return value
    predicate_value_dtype = pd.Series(value).dtype
    raise TypeError(
        f"Unexpected type for predicate: Column {column_name!r} has pandas type "
        f"{pa_type.to_pandas_dtype()} (Arrow type {pa_type}), but predicate value "
        f"{value!r} has pandas type '{predicate_value_dtype}' (Python type '{type(value)}')"
    )
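

# Illustrative sketch (comment only): for date32 columns, predicate values are first
# normalized to ``datetime.date`` and then, for the pushdown pass, re-encoded as days
# since the epoch so they can be compared against the raw parquet statistics:
#
#     >>> _normalize_value("2021-01-02", pa.date32(), column_name="day")
#     datetime.date(2021, 1, 2)
#     >>> _timelike_to_arrow_encoding(datetime.date(1970, 1, 11), pa.date32())
#     10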
def _predicate_accepts(predicate, row_meta, arrow_schema, parquet_reader):
    """
    Check whether a predicate could be satisfied by any row of the given RowGroup.

    This method first casts the value of the predicate to the type used for this column
    in the statistics and then applies the relevant operator. The operation applied here
    is done in a fashion to check if the predicate would evaluate to True for any possible
    row in the RowGroup. Thus e.g. for the `==` predicate, we check if the predicate value
    is in the (min, max) range of the RowGroup.
    """
    col, op, val = predicate
    col_idx = parquet_reader.column_name_idx(col)
    pa_type = arrow_schema[col_idx].type
    parquet_statistics = row_meta.column(col_idx).statistics

    # In case min/max is not set, we have to assume that the predicate matches.
    if not parquet_statistics.has_min_max:
        return True
    min_value = parquet_statistics.min
    max_value = parquet_statistics.max

    # Transform the predicate value to the respective type used in the statistics.

    # Integer overflow protection since statistics are stored as signed integers, see ARROW-5166.
    if pa.types.is_integer(pa_type) and (max_value < min_value):
        return True

    if pa.types.is_timestamp(pa_type):
        # Timestamps in the parquet statistics might be of type datetime.datetime, which is not compatible w/ numpy.
        min_value = np.datetime64(min_value)
        max_value = np.datetime64(max_value)

    # The statistics for floats only contain the 6 most significant digits.
    # So a suitable epsilon has to be considered below min and above max.
    if isinstance(val, float):
        min_value -= _epsilon(min_value)
        max_value += _epsilon(max_value)

    # op can only be "==" or "!=" for scalar null values.
    if op == "==":
        if pd.isnull(val):
            return parquet_statistics.null_count > 0
        else:
            return (min_value <= val) and (max_value >= val)
    elif op == "!=":
        if pd.isnull(val):
            return parquet_statistics.null_count < row_meta.num_rows
        else:
            return not ((min_value >= val) and (max_value <= val))
    elif op == "<=":
        return min_value <= val
    elif op == ">=":
        return max_value >= val
    elif op == "<":
        return min_value < val
    elif op == ">":
        return max_value > val
    elif op == "in":
        # This implementation is chosen for performance reasons. See
        # https://github.com/JDASoftwareGroup/kartothek/pull/130 for more information/benchmarks.
        # We accept the predicate if there is any value in the provided array which is equal to or between
        # the parquet min and max statistics. Otherwise, it is rejected.
        for x in val:
            if pd.isnull(x):
                if parquet_statistics.null_count > 0:
                    return True
            elif min_value <= x <= max_value:
                return True
        return False
    else:
        raise NotImplementedError("op not supported")


def _highest_significant_position(num):
    """
    >>> _highest_significant_position(1.0)
    1
    >>> _highest_significant_position(9.0)
    1
    >>> _highest_significant_position(39.0)
    2
    >>> _highest_significant_position(0.1)
    -1
    >>> _highest_significant_position(0.9)
    -1
    >>> _highest_significant_position(0.000123)
    -4
    >>> _highest_significant_position(1234567.0)
    7
    >>> _highest_significant_position(-0.1)
    -1
    >>> _highest_significant_position(-100.0)
    3
    """
    abs_num = np.absolute(num)
    log_of_abs = np.log10(abs_num)
    position = int(np.floor(log_of_abs))

    # Is the position left of the decimal point?
    if abs_num >= 1.0:
        position += 1

    return position


def _epsilon(num):
    """
    >>> _epsilon(123456)
    1
    >>> _epsilon(0.123456)
    1e-06
    >>> _epsilon(0.123)
    1e-06
    >>> _epsilon(0)
    0
    >>> _epsilon(-0.123456)
    1e-06
    >>> _epsilon(-123456)
    1
    >>> _epsilon(np.inf)
    0
    >>> _epsilon(-np.inf)
    0
    """
    SIGNIFICANT_DIGITS = 6

    if num == 0 or np.isinf(num):
        return 0

    epsilon_position = _highest_significant_position(num) - SIGNIFICANT_DIGITS

    # Is the position right of the decimal point?
    if epsilon_position < 0:
        epsilon_position += 1

    return 10 ** epsilon_position
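

# Worked example of the predicate handling above (sketch, values are made up):
# predicates are given in disjunctive normal form, e.g.
#
#     predicates = [[("x", ">", 10), ("y", "==", "a")], [("x", "<", 0)]]
#
# means ``(x > 10 AND y == "a") OR (x < 0)``. A RowGroup whose statistics report
# ``min(x) == 1`` and ``max(x) == 5`` is skipped by ``_read_row_groups_into_tables``
# because no row in it can satisfy either conjunction; the rows of the RowGroups
# that are read are afterwards filtered exactly by ``filter_df_from_predicates``.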