Source code for kartothek.io.testing.stats_cube

# -*- coding: utf-8 -*-
import dask.bag as db
import pandas as pd
import pytest

from kartothek.core.cube.cube import Cube
from kartothek.io.dask.bag_cube import build_cube_from_bag
from kartothek.io.eager_cube import build_cube

__all__ = (
    "test_fail_blocksize_negative",
    "test_fail_blocksize_wrong_type",
    "test_fail_blocksize_zero",
    "test_fail_no_store_factory",
    "test_multifile",
    "test_simple",
)


def test_simple(driver, function_store, function_store_rwro):
    """Stats for a cube with a seed and an enrich dataset."""
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "foo": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )

    result = driver(cube=cube, store=function_store_rwro)

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    stats_seed = result[cube.seed_dataset]
    assert stats_seed["partitions"] == 2
    assert stats_seed["files"] == 2
    assert stats_seed["rows"] == 4
    assert stats_seed["blobsize"] > 0

    stats_enrich = result["enrich"]
    assert stats_enrich["partitions"] == stats_seed["partitions"]
    assert stats_enrich["files"] == stats_seed["files"]
    assert stats_enrich["rows"] == stats_seed["rows"]
    assert stats_enrich["blobsize"] != stats_seed["blobsize"]


def test_multifile(driver, function_store):
    """Multiple files within a single partition are counted correctly."""
    dfs = [pd.DataFrame({"x": [i], "p": [0], "v1": [10]}) for i in range(2)]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube_from_bag(
        data=db.from_sequence(dfs, partition_size=1), cube=cube, store=function_store
    ).compute()

    result = driver(cube=cube, store=function_store)

    assert set(result.keys()) == {cube.seed_dataset}

    stats_seed = result[cube.seed_dataset]
    assert stats_seed["partitions"] == 1
    assert stats_seed["files"] == 2
    assert stats_seed["rows"] == 2
    assert stats_seed["blobsize"] > 0


def test_fail_no_store_factory(driver, function_store, skip_eager):
    """Passing a plain store instead of a store factory raises a TypeError."""
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(data=df_seed, cube=cube, store=function_store)
    store = function_store()

    with pytest.raises(TypeError) as exc:
        driver(cube=cube, store=store, no_run=True)
    assert str(exc.value) == "store must be a factory but is HFilesystemStore"


def test_fail_blocksize_wrong_type(driver, function_store, skip_eager):
    """A non-integer ``blocksize`` raises a TypeError."""
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(TypeError, match="blocksize must be an integer but is str"):
        driver(cube=cube, store=function_store, blocksize="foo")


def test_fail_blocksize_negative(driver, function_store, skip_eager):
    """A negative ``blocksize`` raises a ValueError."""
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(ValueError, match="blocksize must be > 0 but is -1"):
        driver(cube=cube, store=function_store, blocksize=-1)


def test_fail_blocksize_zero(driver, function_store, skip_eager):
    """A ``blocksize`` of zero raises a ValueError."""
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(ValueError, match="blocksize must be > 0 but is 0"):
        driver(cube=cube, store=function_store, blocksize=0)
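

These tests are backend-agnostic: a consuming test module star-imports them and supplies the ``driver`` fixture, while the store fixtures (``function_store``, ``function_store_rwro``) and the skip fixtures come from the suite's shared conftest. Below is a minimal wiring sketch, assuming the eager backend's ``collect_stats`` is used as the driver; the fixture bodies are illustrative, not the suite's actual conftest.

# Sketch of a consuming test module's conftest (not part of this file).
import pytest

from kartothek.io.eager_cube import collect_stats
from kartothek.io.testing.stats_cube import *  # noqa: F401,F403


@pytest.fixture
def driver():
    # Eager backend: statistics are collected synchronously.
    def _driver(cube, store, **kwargs):
        return collect_stats(cube=cube, store=store, **kwargs)

    return _driver


@pytest.fixture
def skip_eager():
    # ``blocksize``/store-factory checks only apply to distributed backends,
    # so tests requesting this fixture are skipped for the eager driver.
    pytest.skip("test not applicable to the eager backend")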