# Source code for dimcat.data.base
from __future__ import annotations
import logging
import os
from pathlib import Path
from pprint import pformat
from typing import Optional
import marshmallow as mm
from dimcat.base import (
DimcatConfig,
DimcatObject,
get_pickle_schema,
get_schema,
get_setting,
)
from dimcat.dc_exceptions import BaseFilePathMismatchError
# Module-level logger; Data.treat_new_basepath() falls back to it when no other logger is passed.
module_logger = logging.getLogger(__name__)
class AbsolutePathStr(str):
    """This is just a string but if it includes the HOME directory, it is represented with a leading '~'.

    Only ``repr()`` is abbreviated; ``str()`` and every other string operation see the full path.
    """

    def __repr__(self):
        """Return the path with a leading home directory replaced by "~".

        The home directory is whatever "~" resolves to via :func:`os.path.expanduser`. It is only
        abbreviated when it matches whole leading path components, so a sibling directory that merely
        shares the textual prefix (e.g. ``/home/user2`` when the home is ``/home/user``) is returned
        unchanged. Unlike the default ``str.__repr__``, the result is not wrapped in quotes.
        """
        path = str(self)
        separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
        # Drop trailing separators so that the character following the prefix can be tested uniformly.
        home = os.path.expanduser("~").rstrip("".join(separators))
        if not path.startswith(home):
            return path
        remainder = path[len(home) :]
        is_home_itself = bool(home) and not remainder
        # A separator right after the prefix proves that the match ends on a component boundary.
        if is_home_itself or remainder.startswith(separators):
            return "~" + remainder
        return path
def resolve_path(path) -> Optional[AbsolutePathStr]:
    """Resolves '~' to HOME directory and turns ``path`` into an absolute path.

    NOTE: According to the original documentation, dimcat.utils holds a copy of this function; keep the
    two in sync when changing either.

    Args:
        path: A ``str`` or :class:`pathlib.Path`, or None.

    Returns:
        None if ``path`` is None. Otherwise the user-expanded, absolute, normalized path without trailing
        path separators (a filesystem root such as "/" is returned as is).

    Raises:
        TypeError: If ``path`` is neither None, nor a str, nor a Path.
    """
    if path is None:
        return None
    if isinstance(path, Path):
        path = str(path)
    elif not isinstance(path, str):
        raise TypeError(f"Expected str or Path, got {type(path)}")
    # Always absolutize after expanding: a "~" that is not a leading home reference (e.g. "backup~/x")
    # is left untouched by expanduser() and would otherwise be returned as a relative path.
    path = os.path.abspath(os.path.expanduser(path))
    trimmed = path.rstrip("/\\")
    # Only strip trailing separators when something other than a bare root/drive remains; otherwise
    # "/" would collapse to "" and "C:\\" to the drive-relative "C:".
    if trimmed and os.path.splitdrive(trimmed)[1]:
        path = trimmed
    return AbsolutePathStr(path)
class Data(DimcatObject):
    """
    This base class unites all classes containing data in some way or another.

    Every data object has an optional :attr:`basepath`, the directory where data would be stored.
    """

    @staticmethod
    def treat_new_basepath(
        basepath: str, filepath=None, other_logger=None
    ) -> AbsolutePathStr:
        """Resolve ``basepath`` and make sure it can be used as a basepath.

        Args:
            basepath: Directory to be used as basepath. It is resolved via :func:`resolve_path`.
            filepath:
                Optional path relative to ``basepath``. If specified, joining it onto the resolved
                basepath must yield an existing file.
            other_logger: Logger to emit the debug message with. Defaults to this module's logger.

        Returns:
            The resolved basepath.

        Raises:
            TypeError: If ``basepath`` is None (or of a type rejected by :func:`resolve_path`).
            NotADirectoryError:
                If the directory does not exist and the setting "auto_make_dirs" is not enabled.
            BaseFilePathMismatchError: If ``filepath`` is specified but not an existing file under the basepath.
        """
        basepath_arg = resolve_path(basepath)
        if basepath_arg is None:
            # resolve_path() passes None through; fail with a clear message instead of an opaque
            # TypeError from the filesystem calls below.
            raise TypeError("basepath cannot be None.")
        if not os.path.isdir(basepath_arg):
            if get_setting("auto_make_dirs"):
                # exist_ok=True: the directory may have been created concurrently since the check above.
                os.makedirs(basepath_arg, exist_ok=True)
            else:
                raise NotADirectoryError(
                    f"basepath {basepath_arg!r} is not an existing directory."
                )
        if filepath and not os.path.isfile(os.path.join(basepath_arg, filepath)):
            # this would result in a normpath that does not exist
            raise BaseFilePathMismatchError(basepath_arg, filepath)
        if other_logger is None:
            other_logger = module_logger
        other_logger.debug(f"The basepath has been set to {basepath_arg!r}")
        return basepath_arg

    # NOTE(review): chaining @classmethod and @property is deprecated since Python 3.11 and no longer
    # supported as of Python 3.13. Left unchanged here because the base class is defined elsewhere;
    # confirm which Python versions need to be supported.
    @classmethod
    @property
    def pickle_schema(cls):
        """Returns the (instantiated) PickleSchema singleton object for this class. It is different from the 'normal'
        Schema in that it stores the tabular data to disk and returns the path to its descriptor.
        """
        return get_pickle_schema(cls.dtype)

    class PickleSchema(DimcatObject.Schema):
        """When serializing data objects, the basepath is used as location, but it is not included in the
        descriptor, according to the frictionless specification."""

        @mm.post_dump()
        def validate_dump(self, data, **kwargs):
            """
            Make sure to never return invalid serialization data. Identical with PipelineStep.Schema.validate_dump().

            Args:
                data: The serialized data produced by the dump.
                **kwargs: Additional keyword arguments passed by marshmallow; not used.

            Returns:
                The unchanged ``data``.

            Raises:
                marshmallow.ValidationError:
                    If ``data`` has no 'dtype' field or if the schema retrieved for that dtype reports
                    validation errors.
            """
            if "dtype" not in data:
                msg = (
                    f"{self.name}: The serialized data doesn't have a 'dtype' field, meaning that DiMCAT would "
                    f"not be able to deserialize it."
                )
                raise mm.ValidationError(msg)
            # Validate against the 'normal' schema of the dtype, i.e. the one used for deserialization.
            dtype_schema = get_schema(data["dtype"])
            report = dtype_schema.validate(data)
            if report:
                raise mm.ValidationError(
                    f"Dump of {data['dtype']} created with a {self.name} could not be validated by "
                    f"{dtype_schema.name}."
                    f"\n\nDUMP:\n{pformat(data, sort_dicts=False)}"
                    f"\n\nREPORT:\n{pformat(report, sort_dicts=False)}"
                )
            return data

    class Schema(DimcatObject.Schema):
        """The 'normal' schema of data objects, declaring the optional ``basepath`` field."""

        basepath = mm.fields.Str(
            allow_none=True,
            metadata=dict(
                expose=False, description="The directory where data would be stored."
            ),
        )

    def __init__(
        self,
        basepath: Optional[str] = None,
    ):
        """
        Args:
            basepath:
                The directory where data would be stored. If specified, it is resolved and checked
                via :meth:`treat_new_basepath`; otherwise the basepath remains None.
        """
        super().__init__()
        self._basepath = None
        if basepath is not None:
            self.basepath = basepath

    def __str__(self):
        return self.info(return_str=True)

    @property
    def basepath(self) -> Optional[str]:
        """The directory where data would be stored, or None if it has not been set.

        Assigning a value passes it through :meth:`treat_new_basepath`.
        """
        return self._basepath

    @basepath.setter
    def basepath(self, basepath: str):
        self._basepath = self.treat_new_basepath(basepath, other_logger=self.logger)

    def get_basepath(
        self,
        set_default_if_missing: bool = False,
    ) -> str:
        """Get the basepath of the resource. If not specified, the default basepath is returned.
        If ``set_default_if_missing`` is set to True and no basepath has been set (e.g. during initialization),
        the :attr:`basepath` is permanently set to the default basepath.
        """
        if not self.basepath:
            default_basepath = resolve_path(get_setting("default_basepath"))
            if set_default_if_missing:
                # Assign the private attribute directly, bypassing the setter and thus the
                # checks performed by treat_new_basepath().
                self._basepath = default_basepath
                self.logger.debug(
                    f"The {self.name}'s basepath has been set to the default {default_basepath!r}"
                )
            return default_basepath
        return self.basepath

    def to_config(self, pickle=False) -> DimcatConfig:
        """Returns the serialized object (see :meth:`to_dict`) wrapped in a :class:`DimcatConfig`.

        Args:
            pickle: If set to True, the object is serialized with the :attr:`pickle_schema` instead of the 'normal' schema.
        """
        return DimcatConfig(self.to_dict(pickle=pickle))

    def to_dict(self, pickle=False) -> dict:
        """Returns the serialized object as a dictionary.

        Args:
            pickle: If set to True, the object is serialized with the :attr:`pickle_schema` instead of the 'normal' schema.
        """
        if pickle:
            return dict(self.pickle_schema.dump(self))
        return dict(self.schema.dump(self))