Source code for kartothek.core.testing

# -*- coding: utf-8 -*-


import contextlib
import datetime
from datetime import date
from unittest import mock
from warnings import catch_warnings, simplefilter

import hypothesis.extra.numpy as hyp_np
import hypothesis.strategies as hyp_st
import numpy as np
import pandas as pd
from hypothesis.errors import NonInteractiveExampleWarning

from kartothek.core.uuid import gen_uuid_object

# Fixed point in time used when the clock is frozen in tests.
TIME_TO_FREEZE = datetime.datetime(
    year=2000, month=1, day=1, hour=1, minute=1, second=1, microsecond=1
)
# ISO 8601 rendering of TIME_TO_FREEZE.
TIME_TO_FREEZE_ISO = "2000-01-01T01:01:01.000001"
# Same ISO string with every ":" percent-encoded as "%3A".
TIME_TO_FREEZE_ISO_QUOTED = "2000-01-01T01%3A01%3A01.000001"


def get_dataframe_alltypes():
    """
    Build a one-row pandas DataFrame with a column for each commonly used data type.

    The flat columns are taken from :func:`get_dataframe_not_nested`. In addition,
    one ``array_<dtype>`` column is added per numeric dtype, plus ``array_unicode``;
    each of those cells holds a one-element numpy array stored as a Python object.

    Returns
    -------
    pandas.DataFrame
        Single-row frame with a fresh ``RangeIndex`` and alphabetically sorted columns.
    """
    flat_frame = get_dataframe_not_nested()

    numeric_type_names = (
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float32",
        "float64",
    )
    # Column order is irrelevant here because the result is sorted by column name below.
    array_columns = {
        "array_" + type_name: pd.Series(
            [np.array([1], dtype=getattr(np, type_name))], dtype=object
        )
        for type_name in numeric_type_names
    }
    array_columns["array_unicode"] = pd.Series(
        [np.array(["Ö"], dtype=object)], dtype=object
    )
    nested_frame = pd.DataFrame(array_columns)

    combined = pd.concat([flat_frame, nested_frame], axis=1)
    combined = combined.reset_index(drop=True)
    return combined.sort_index(axis=1)


def get_dataframe_not_nested():
    """
    Build a one-row pandas DataFrame with a column for each flat (non-nested) data type.

    Covers bool, signed and unsigned integers, floats, ``date`` objects, ``datetime64[ns]``,
    unicode text, a null column and a raw bytes column.

    Returns
    -------
    pandas.DataFrame
        Single-row frame with alphabetically sorted columns.
    """
    columns = {"bool": pd.Series([1], dtype=np.bool_)}

    integer_type_names = (
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
    )
    for type_name in integer_type_names:
        columns[type_name] = pd.Series([1], dtype=getattr(np, type_name))

    for type_name in ("float32", "float64"):
        columns[type_name] = pd.Series([1.0], dtype=getattr(np, type_name))

    columns["date"] = pd.Series([date(2018, 1, 1)], dtype=object)
    columns["datetime64"] = pd.Series(["2018-01-01"], dtype="datetime64[ns]")
    columns["unicode"] = pd.Series(["Ö"], dtype=str)
    columns["null"] = pd.Series([None], dtype=object)
    # Adding a byte type with value as byte sequence which can not be encoded as UTF8
    columns["byte"] = pd.Series([gen_uuid_object().bytes], dtype=object)

    # Insertion order does not matter: columns are sorted by name before returning.
    frame = pd.DataFrame(columns)
    return frame.sort_index(axis=1)


def get_scalar_dtype_strategy(exclude=None):
    """
    A `hypothesis` strategy yielding numpy scalar dtypes.

    The strategy draws from one of the dtype families ``"datetime"``, ``"uint"``,
    ``"int"``, ``"float"``, ``"byte"`` and ``"unicode"``.

    Parameters
    ----------
    exclude: None, str or list/tuple/set of str
        Name(s) of the dtype families that must not be generated. A single
        name may be passed directly; ``None`` excludes nothing.

    Returns
    -------
    hypothesis.strategies.SearchStrategy
        Strategy choosing among the dtype strategies that were not excluded.

    Raises
    ------
    ValueError
        If an excluded name is not one of the known dtype families.
    """
    possible_strategies = {
        "datetime": hyp_np.datetime64_dtypes(max_period="ms", min_period="ns"),
        "uint": hyp_np.unsigned_integer_dtypes(),
        "int": hyp_np.integer_dtypes(),
        "float": hyp_np.floating_dtypes(),
        "byte": hyp_np.byte_string_dtypes(),
        "unicode": hyp_np.unicode_string_dtypes(),
    }

    # Accept a single name as well as any common collection of names.
    if exclude is None:
        excluded = []
    elif isinstance(exclude, (list, tuple, set, frozenset)):
        excluded = list(exclude)
    else:
        excluded = [exclude]

    # Validate against the full, unmodified mapping so that the error message
    # always lists every valid name and duplicates are not reported as unknown.
    for ex in excluded:
        if ex not in possible_strategies:
            raise ValueError(
                "Strategy {} unknown. Possible values are {}".format(
                    ex, possible_strategies.keys()
                )
            )

    remaining = [
        strategy
        for name, strategy in possible_strategies.items()
        if name not in excluded
    ]
    return hyp_st.one_of(*remaining)


def get_numpy_array_strategy(
    shape=10, exclude_dtypes=None, unique=False, sort=False, allow_nan=True
):
    """
    A `hypothesis` strategy yielding numpy arrays of a randomly chosen dtype.

    Besides the scalar dtypes offered by :func:`get_scalar_dtype_strategy`, the
    strategy may also yield object arrays of ``datetime.date`` values (dtype
    family ``"date"``).

    Parameters
    ----------
    shape: int
        Number of elements per generated array. It is forwarded to
        ``hypothesis.extra.numpy.arrays`` and also used as the exact list
        length for the date arrays.
    exclude_dtypes: None, str or list/tuple/set of str
        Dtype families that must not be generated: any name accepted by
        :func:`get_scalar_dtype_strategy`, or ``"date"``.
    unique: bool
        Request distinct elements within each array. If NaNs are allowed,
        at most one NaN is kept per float array.
    sort: bool
        Sort every generated array with ``numpy.sort``.
    allow_nan: bool
        If ``False``, float arrays containing NaN are rejected.

    Returns
    -------
    hypothesis.strategies.SearchStrategy
        Strategy producing the arrays described above.

    Raises
    ------
    ValueError
        If `exclude_dtypes` names an unknown dtype family.
    """
    # the text example generation has quite some overhead when called the first time.
    # we don't want this in our test sample generation since the HealthCheck of hypothesis
    # might be triggered.
    with catch_warnings():
        simplefilter("ignore", NonInteractiveExampleWarning)
        hyp_st.text().example()

    # Normalise to a list up front. Using ``in`` on a plain string would be a
    # substring test, so excluding "datetime" would also exclude "date".
    if exclude_dtypes is None:
        excluded = []
    elif isinstance(exclude_dtypes, (list, tuple, set, frozenset)):
        excluded = list(exclude_dtypes)
    else:
        excluded = [exclude_dtypes]

    # "date" is handled in this function only; the scalar dtype strategy does
    # not know it and would reject it as an unknown name.
    scalar_excluded = [name for name in excluded if name != "date"]

    dtype_strategy = get_scalar_dtype_strategy(scalar_excluded)
    array_strategy = hyp_np.arrays(dtype=dtype_strategy, shape=shape, unique=unique)

    if "date" not in excluded:
        date_arrays = hyp_st.lists(
            hyp_st.dates(
                min_value=datetime.date(1970, 1, 1), max_value=datetime.date(2100, 1, 1)
            ),
            min_size=shape,
            max_size=shape,
            unique=unique,
        ).map(np.array)
        array_strategy = hyp_st.one_of([array_strategy, date_arrays])

    def _restrict_datetime_ranges(arr):
        # Only datetime64 arrays are range-checked; everything else passes.
        if np.issubdtype(arr.dtype, np.datetime64):
            in_range = (arr < np.datetime64("2200-01-01")) & (
                arr > np.datetime64("1970-01-01")
            )
            return bool(np.all(in_range))
        return True

    def _check_for_nan(arr):
        if np.issubdtype(arr.dtype, np.floating):
            return not np.isnan(arr).any()
        return True

    def _maximum_single_nan(arr):
        # NaN != NaN, so a "unique" float array could otherwise hold several NaNs.
        if np.issubdtype(arr.dtype, np.floating):
            return np.count_nonzero(np.isnan(arr)) <= 1
        return True

    if "datetime" not in excluded:
        array_strategy = array_strategy.filter(_restrict_datetime_ranges)
    if not allow_nan:
        array_strategy = array_strategy.filter(_check_for_nan)
    if unique and allow_nan:
        array_strategy = array_strategy.filter(_maximum_single_nan)

    if sort:
        array_strategy = array_strategy.map(np.sort)
    return array_strategy


@contextlib.contextmanager
def cm_frozen_time(time_to_freeze):
    """
    Context manager freezing the clock helpers of ``kartothek.core._time``.

    While the context is active, ``kartothek.core._time.datetime_now`` and
    ``kartothek.core._time.datetime_utcnow`` are replaced by mocks that both
    return `time_to_freeze`. The patches are undone when the context exits.
    """
    now_patch = mock.patch(
        "kartothek.core._time.datetime_now", return_value=time_to_freeze
    )
    utcnow_patch = mock.patch(
        "kartothek.core._time.datetime_utcnow", return_value=time_to_freeze
    )
    with now_patch, utcnow_patch:
        yield