# Source code for kartothek.core.cube.cube

import typing

import attr

from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPERATOR
from kartothek.core.dataset import _validate_uuid
from kartothek.utils.converters import (
    converter_str,
    converter_str_set,
    converter_str_tupleset,
)

__all__ = ("Cube",)


def _validate_not_subset(of, allow_none=False):
    """
    Create validator to check if an attribute is not a subset of ``of``.

    Parameters
    ----------
    of: str
        Attribute name that the subject under validation should not be a subset of.

    Returns
    -------
    validator: Callable
        Validator that can be used for ``attr.ib``.
    """

    def _v(instance, attribute, value):
        if allow_none and value is None:
            return
        other_set = set(getattr(instance, of))
        if isinstance(value, str):
            my_set = {value}
        else:
            my_set = set(value)
        share = my_set & other_set

        if share:
            raise ValueError(
                "{attribute} cannot share columns with {of}, but share the following: {share}".format(
                    attribute=attribute.name, of=of, share=", ".join(sorted(share))
                )
            )

    return _v


def _validate_subset(of, allow_none=False):
    """
    Create validator to check that an attribute is a subset of ``of``.

    Parameters
    ----------
    of: str
        Attribute name that the subject under validation should be a subset of.

    Returns
    -------
    validator: Callable
        Validator that can be used for ``attr.ib``.
    """

    def _v(instance, attribute, value):
        if allow_none and value is None:
            return
        other_set = set(getattr(instance, of))
        if isinstance(value, str):
            my_set = {value}
        else:
            my_set = set(value)
        too_much = my_set - other_set

        if too_much:
            raise ValueError(
                "{attribute} must be a subset of {of}, but it has additional values: {too_much}".format(
                    attribute=attribute.name,
                    of=of,
                    too_much=", ".join(sorted(too_much)),
                )
            )

    return _v


def _validator_uuid(instance, attribute, value):
    """
    Attr validator that checks whether a UUID attribute is valid.

    Delegates to :func:`_validator_uuid_freestanding`, using the attribute's name as the label
    in error messages. ``instance`` is not used.
    """
    _validator_uuid_freestanding(name=attribute.name, value=value)


def _validator_uuid_freestanding(name, value):
    """
    Freestanding version of :func:`_validator_uuid`.

    It takes the label directly instead of an ``attr`` attribute object, so it can also be
    used outside of ``attr.ib`` validation.

    Parameters
    ----------
    name: str
        Label of the value under validation; only used in error messages.
    value: str
        UUID candidate to check.

    Raises
    ------
    ValueError
        If ``_validate_uuid`` rejects ``value`` or if ``value`` contains
        ``KTK_CUBE_UUID_SEPERATOR``.
    """
    if not _validate_uuid(value):
        raise ValueError(
            '{name} ("{value}") is not compatible with kartothek'.format(
                name=name, value=value
            )
        )
    # The separator joins the cube prefix and the dataset ID, so neither part may contain it.
    if KTK_CUBE_UUID_SEPERATOR in value:
        raise ValueError(
            '{name} ("{value}") must not contain UUID separator {sep}'.format(
                name=name, value=value, sep=KTK_CUBE_UUID_SEPERATOR
            )
        )


def _validator_not_empty(instance, attribute, value):
    """
    Attr validator to validate that a list is not empty:
    """
    if len(value) == 0:
        raise ValueError("{name} must not be empty".format(name=attribute.name))


@attr.s(frozen=True)
class Cube:
    """
    OLAP-like cube that fuses multiple datasets.

    Instances are frozen; use :meth:`copy` to derive a modified specification. Violations of the
    constraints listed below are reported as ``ValueError`` by the attribute validators.

    Parameters
    ----------
    dimension_columns: Tuple[str, ...]
        Columns that span dimensions. This will imply index columns for the seed dataset, unless
        the automatic index creation is suppressed via ``suppress_index_on``. Must not be empty.
    partition_columns: Tuple[str, ...]
        Columns that are used to partition the data. They also create (implicit) primary indices.
        Must not be empty and must not share columns with ``dimension_columns``.
    uuid_prefix: str
        All datasets that are part of the cube will have UUIDs of form ``'uuid_prefix++ktk_cube_dataset_id'``.
    seed_dataset: str
        Dataset that presents the ground-truth regarding cells present in the cube. Defaults to ``"seed"``.
    index_columns: Tuple[str, ...]
        Columns for which secondary indices will be created. They may also be part of non-seed datasets.
        Must not share columns with ``dimension_columns`` or ``partition_columns``.
    suppress_index_on: Tuple[str, ...]
        Suppress auto-creation of an index on the given dimension columns. Must be a subset of ``dimension_columns``
        (other columns are not subject to automatic index creation).
    """

    dimension_columns = attr.ib(
        converter=converter_str_tupleset,
        type=typing.Tuple[str, ...],
        validator=[_validator_not_empty],
    )

    # Validated against ``dimension_columns``, which is defined (and therefore set) first.
    partition_columns = attr.ib(
        converter=converter_str_tupleset,
        type=typing.Tuple[str, ...],
        validator=[_validator_not_empty, _validate_not_subset("dimension_columns")],
    )

    uuid_prefix = attr.ib(
        converter=converter_str, type=str, validator=[_validator_uuid]
    )

    seed_dataset = attr.ib(
        converter=converter_str, default="seed", type=str, validator=[_validator_uuid]
    )

    index_columns = attr.ib(
        converter=converter_str_set,
        default=None,
        type=typing.FrozenSet[str],
        validator=[
            _validate_not_subset("dimension_columns"),
            _validate_not_subset("partition_columns"),
        ],
    )

    suppress_index_on = attr.ib(
        converter=converter_str_set,
        default=None,
        type=typing.FrozenSet[str],
        validator=[_validate_subset("dimension_columns", allow_none=True)],
    )

    def ktk_dataset_uuid(self, ktk_cube_dataset_id):
        """
        Get Kartothek dataset UUID for given dataset UUID, so the prefix is included.

        Parameters
        ----------
        ktk_cube_dataset_id: str
            Dataset ID w/o prefix

        Returns
        -------
        ktk_dataset_uuid: str
            Prefixed dataset UUID for Kartothek, i.e. ``uuid_prefix``, the cube UUID separator and the
            dataset ID concatenated.

        Raises
        ------
        ValueError
            If ``ktk_cube_dataset_id`` is not a valid UUID for kartothek or contains the cube UUID
            separator. Errors raised by ``converter_str`` for unsuitable input propagate unchanged.
        """
        ktk_cube_dataset_id = converter_str(ktk_cube_dataset_id)
        _validator_uuid_freestanding("ktk_cube_dataset_id", ktk_cube_dataset_id)
        return "{uuid_prefix}{sep}{ktk_cube_dataset_id}".format(
            uuid_prefix=self.uuid_prefix,
            sep=KTK_CUBE_UUID_SEPERATOR,
            ktk_cube_dataset_id=ktk_cube_dataset_id,
        )

    @property
    def ktk_index_columns(self):
        """
        Set of all available index columns through Kartothek, primary and secondary.

        This is the union of the partition columns, the index columns and those dimension columns
        that are not listed in ``suppress_index_on``.
        """
        # FIXME: do not always add dimension columns. Also, check all users of this property!
        return (
            set(self.partition_columns)
            | set(self.index_columns)
            | (set(self.dimension_columns) - set(self.suppress_index_on))
        )

    def copy(self, **kwargs):
        """
        Create a new cube specification w/ changed attributes.

        This will not trigger any IO operation, but only affects the cube specification.
        The new object is built with ``attr.evolve``; this instance is left untouched.

        Parameters
        ----------
        kwargs: Dict[str, Any]
            Attributes that should be changed.

        Returns
        -------
        cube: Cube
            New abstract cube.
        """
        return attr.evolve(self, **kwargs)