Source code for kartothek.core.cube.cube

import typing

import attr

from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPERATOR
from kartothek.core.dataset import _validate_uuid
from kartothek.utils.converters import (
    converter_str,
    converter_str_set,
    converter_str_tupleset,
)

__all__ = ("Cube",)


def _validate_not_subset(of, allow_none=False):
    """
    Create validator to check if an attribute is not a subset of ``of``.

    Parameters
    ----------
    of: str
        Attribute name that the subject under validation should not be a subset of.

    Returns
    -------
    validator: Callable
        Validator that can be used for ``attr.ib``.
    """

    def _v(instance, attribute, value):
        if allow_none and value is None:
            return
        other_set = set(getattr(instance, of))
        if isinstance(value, str):
            my_set = {value}
        else:
            my_set = set(value)
        share = my_set & other_set

        if share:
            raise ValueError(
                "{attribute} cannot share columns with {of}, but share the following: {share}".format(
                    attribute=attribute.name, of=of, share=", ".join(sorted(share))
                )
            )

    return _v


def _validate_subset(of, allow_none=False):
    """
    Create validator to check that an attribute is a subset of ``of``.

    Parameters
    ----------
    of: str
        Attribute name that the subject under validation should be a subset of.

    Returns
    -------
    validator: Callable
        Validator that can be used for ``attr.ib``.
    """

    def _v(instance, attribute, value):
        if allow_none and value is None:
            return
        other_set = set(getattr(instance, of))
        if isinstance(value, str):
            my_set = {value}
        else:
            my_set = set(value)
        too_much = my_set - other_set

        if too_much:
            raise ValueError(
                "{attribute} must be a subset of {of}, but it has additional values: {too_much}".format(
                    attribute=attribute.name,
                    of=of,
                    too_much=", ".join(sorted(too_much)),
                )
            )

    return _v


def _validator_uuid(instance, attribute, value):
    """
    Attr validator that checks whether a UUID is valid.

    Adapter with the ``(instance, attribute, value)`` signature expected by
    ``attr.ib``; the actual check is delegated to
    :func:`_validator_uuid_freestanding`, using the attribute name as the
    label in error messages. ``instance`` is not used.
    """
    attribute_name = attribute.name
    _validator_uuid_freestanding(name=attribute_name, value=value)


def _validator_uuid_freestanding(name, value):
    """
    Freestanding version of :func:`_validator_uuid`.

    Can be used outside of ``attr.ib`` validation, e.g. for plain function
    arguments.

    Parameters
    ----------
    name: str
        Label of the value under validation, used in error messages.
    value: str
        UUID candidate to check.

    Raises
    ------
    ValueError
        If ``_validate_uuid`` rejects ``value`` or if ``value`` contains
        ``KTK_CUBE_UUID_SEPERATOR``.
    """
    if not _validate_uuid(value):
        raise ValueError(
            '{name} ("{value}") is not compatible with kartothek'.format(
                name=name, value=value
            )
        )
    # The separator joins prefix and dataset ID into the final UUID (see
    # ``Cube.ktk_dataset_uuid``), so it must not occur inside either part.
    if KTK_CUBE_UUID_SEPERATOR in value:
        raise ValueError(
            '{name} ("{value}") must not contain UUID separator {sep}'.format(
                name=name, value=value, sep=KTK_CUBE_UUID_SEPERATOR
            )
        )


def _validator_not_empty(instance, attribute, value):
    """
    Attr validator to validate that a list is not empty:
    """
    if len(value) == 0:
        raise ValueError("{name} must not be empty".format(name=attribute.name))


@attr.s(frozen=True)
class Cube:
    """
    OLAP-like cube that fuses multiple datasets.

    Instances are frozen; use :meth:`copy` to derive a modified specification.

    Parameters
    ----------
    dimension_columns: Tuple[str, ...]
        Columns that span dimensions. This will imply index columns for the seed dataset, unless the automatic index
        creation is suppressed via ``suppress_index_on``. Must not be empty.
    partition_columns: Tuple[str, ...]
        Columns that are used to partition the data. They also create (implicit) primary indices. Must not be empty
        and must not share columns with ``dimension_columns``.
    uuid_prefix: str
        All datasets that are part of the cube will have UUIDs of form ``'uuid_prefix++ktk_cube_dataset_id'``.
    seed_dataset: str
        Dataset that presents the ground-truth regarding cells present in the cube. Defaults to ``"seed"``.
    index_columns: Tuple[str, ...]
        Columns for which secondary indices will be created. They may also be part of non-seed datasets. Must not
        share columns with ``dimension_columns`` or ``partition_columns``.
    suppress_index_on: Tuple[str, ...]
        Suppress auto-creation of an index on the given dimension columns. Must be a subset of ``dimension_columns``
        (other columns are not subject to automatic index creation).
    """

    # NOTE: the declaration order below defines the positional order of the
    # generated ``__init__`` and which attributes a validator may already read.
    dimension_columns = attr.ib(
        converter=converter_str_tupleset,
        type=typing.Tuple[str, ...],
        validator=[_validator_not_empty],
    )
    partition_columns = attr.ib(
        converter=converter_str_tupleset,
        type=typing.Tuple[str, ...],
        validator=[_validator_not_empty, _validate_not_subset("dimension_columns")],
    )
    uuid_prefix = attr.ib(
        converter=converter_str, type=str, validator=[_validator_uuid]
    )
    seed_dataset = attr.ib(
        converter=converter_str, default="seed", type=str, validator=[_validator_uuid]
    )
    index_columns = attr.ib(
        converter=converter_str_set,
        default=None,
        type=typing.FrozenSet[str],
        validator=[
            _validate_not_subset("dimension_columns"),
            _validate_not_subset("partition_columns"),
        ],
    )
    suppress_index_on = attr.ib(
        converter=converter_str_set,
        default=None,
        type=typing.FrozenSet[str],
        validator=[_validate_subset("dimension_columns", allow_none=True)],
    )

    def ktk_dataset_uuid(self, ktk_cube_dataset_id):
        """
        Get Kartothek dataset UUID for given dataset UUID, so the prefix is included.

        Parameters
        ----------
        ktk_cube_dataset_id: str
            Dataset ID w/o prefix

        Returns
        -------
        ktk_dataset_uuid: str
            Prefixed dataset UUID for Kartothek, i.e. the cube's ``uuid_prefix``, the UUID separator and the
            dataset ID concatenated.

        Raises
        ------
        ValueError
            If ``ktk_cube_dataset_id`` is not a string or if it is not a valid UUID.
        """
        dataset_id = converter_str(ktk_cube_dataset_id)
        _validator_uuid_freestanding("ktk_cube_dataset_id", dataset_id)
        return "{uuid_prefix}{sep}{ktk_cube_dataset_id}".format(
            uuid_prefix=self.uuid_prefix,
            sep=KTK_CUBE_UUID_SEPERATOR,
            ktk_cube_dataset_id=dataset_id,
        )

    @property
    def ktk_index_columns(self):
        """
        Set of all available index columns through Kartothek, primary and secondary.

        This is the union of the partition columns, the explicit index columns and those dimension columns
        that are not listed in ``suppress_index_on``.
        """
        # FIXME: do not always add dimension columns. Also, check all users of this property!
        primary = set(self.partition_columns)
        secondary = set(self.index_columns)
        auto_indexed = set(self.dimension_columns) - set(self.suppress_index_on)
        return primary | secondary | auto_indexed

    def copy(self, **kwargs):
        """
        Create a new cube specification w/ changed attributes.

        This will not trigger any IO operation, but only affects the cube specification.

        Parameters
        ----------
        kwargs: Dict[str, Any]
            Attributes that should be changed.

        Returns
        -------
        cube: Cube
            New abstract cube.
        """
        changes = kwargs
        return attr.evolve(self, **changes)