project module

class datumaro.components.project.IgnoreMode(value)[source]

Bases: enum.Enum

An enumeration.

rewrite = 1
append = 2
remove = 3
class datumaro.components.project.ProjectSources(tree: datumaro.components.project.Tree)[source]

Bases: datumaro.components.project.CrudProxy[datumaro.components.config_model.Source]

__init__(*args, **kwargs)[source]

Initialize self. See help(type(self)) for accurate signature.

class datumaro.components.project.BuildStageType(value)[source]

Bases: enum.Enum

An enumeration.

source = 1
project = 2
transform = 3
filter = 4
convert = 5
inference = 6
class datumaro.components.project.Pipeline(config: Optional[datumaro.components.config_model.PipelineConfig] = None)[source]

Bases: object

static _get_subgraph(graph, target)[source]

Returns a subgraph with all the target dependencies and the target itself.

__init__(*args, **kwargs)[source]

Initialize self. See help(type(self)) for accurate signature.

property head: str
property head_node
get_slice(target) datumaro.components.project.Pipeline[source]
class datumaro.components.project.ProjectBuilder(project: datumaro.components.project.Project, tree: datumaro.components.project.Tree)[source]

Bases: object

_init_pipeline(pipeline: datumaro.components.project.Pipeline, working_dir_hashes=None)[source]

Initializes datasets in the pipeline nodes. Currently, only the head node will have a dataset on exit, so no extra memory is wasted for the intermediate nodes.

make_dataset(pipeline: datumaro.components.project.Pipeline) datumaro.components.extractor.IExtractor[source]
class datumaro.components.project.ProjectBuildTargets(tree: datumaro.components.project.Tree)[source]

Bases: datumaro.components.project.CrudProxy[datumaro.components.config_model.BuildTarget]

MAIN_TARGET = 'project'
BASE_STAGE = 'root'
add_target(name) datumaro.components.config_model.BuildTarget[source]
add_stage(target, value, prev=None, name=None) str[source]
remove_target(name: str)[source]
remove_stage(target: str, name: str)[source]
add_transform_stage(target: str, transform: str, params: Optional[Dict] = None, name: Optional[str] = None)[source]
add_inference_stage(target: str, model: str, params: Optional[Dict] = None, name: Optional[str] = None)[source]
add_filter_stage(target: str, expr: str, params: Optional[Dict] = None, name: Optional[str] = None)[source]
add_convert_stage(target: str, format: str, params: Optional[Dict] = None, name: Optional[str] = None)[source]
static make_target_name(target: str, stage: Optional[str] = None) str[source]
classmethod split_target_name(name: str) Tuple[str, str][source]
classmethod strip_target_name(name: str) str[source]
make_pipeline(target: str) datumaro.components.project.Pipeline[source]
class datumaro.components.project.GitWrapper(project_dir, repo=None)[source]

Bases: object

static module()[source]
property initialized
init()[source]
close()[source]
checkout(ref: str, dst_dir=None, clean=False, force=False)[source]
add(paths, base=None)[source]

Adds paths to the index. Paths can be truncated relative to base.

commit(message) str[source]

Creates a new revision from index. Returns: new revision hash.

GitTree

alias of object

GitStatus

alias of str

status(paths: Union[str, GitTree, Iterable[str]] = None, base_dir: str = None) Dict[str, GitStatus][source]

Compares working directory and index.

Parameters
  • paths – an iterable of paths to compare, a git.Tree, or None. When None, uses all the paths from HEAD.

  • base_dir – a base path for paths. It is prepended to each path. When None or ‘’, the repo root is used. Can be useful if the index contains displaced paths that need to be mapped onto real paths.

The statuses are:
  • “A” for added paths

  • “D” for deleted paths

  • “R” for renamed paths

  • “M” for paths with modified data

  • “T” for paths with a changed type

Returns: { abspath(base_dir + path): status }

is_ref(rev)[source]
has_commits()[source]
get_tree(ref)[source]
write_tree(tree, base_path: str, include_files: Optional[List[str]] = None)[source]
property head: str
property branch: str
rev_parse(ref: str) Tuple[str, str][source]

Expands named refs and tags.

Returns: object type, object hash

ignore(paths: Union[str, List[str]], mode: Union[None, str, datumaro.components.project.IgnoreMode] = None, gitignore: Optional[str] = None)[source]
HASH_LEN = 40
classmethod is_hash(s: str) bool[source]
log(depth=10) List[Tuple[Any, int]][source]

Returns: a list of (commit, index) pairs

class datumaro.components.project.Tree(project: datumaro.components.project.Project, config: Union[None, Dict, datumaro.components.config.Config, datumaro.components.config_model.TreeConfig] = None, rev: Union[None, Revision] = None)[source]

Bases: object

A tree can be:
  • attached to the work dir

  • attached to a revision

save()[source]
dump(path)[source]
clone() datumaro.components.project.Tree[source]
property sources: datumaro.components.project.ProjectSources
property build_targets: datumaro.components.project.ProjectBuildTargets
property config: datumaro.components.config.Config
property env: datumaro.components.environment.Environment
property rev: Union[None, Revision]
make_pipeline(target: Optional[str] = None) datumaro.components.project.Pipeline[source]
make_dataset(target: Union[None, str, datumaro.components.project.Pipeline] = None) datumaro.components.dataset.Dataset[source]
property is_working_tree: bool
source_data_dir(source) str[source]
class datumaro.components.project.DiffStatus(value)[source]

Bases: enum.Enum

An enumeration.

added = 1
modified = 2
removed = 3
missing = 4
foreign_modified = 5
class datumaro.components.project.Revision(x)
class datumaro.components.project.ObjectId(x)
class datumaro.components.project.Project(path: Optional[str] = None, readonly=False)[source]

Bases: object

static find_project_dir(path: str) Optional[str][source]
static migrate_from_v1_to_v2(src_dir: str, dst_dir: str, skip_import_errors=False)[source]
__init__(path: Optional[str] = None, readonly=False)[source]
classmethod init(path) datumaro.components.project.Project[source]
close()[source]
save()[source]
property readonly: bool
property working_tree: datumaro.components.project.Tree
property head: datumaro.components.project.Tree
property head_rev: Revision
property branch: str
property config: datumaro.components.config.Config
property env: datumaro.components.environment.Environment
property models: Dict[str, datumaro.components.config_model.Model]
get_rev(rev: Union[None, Revision]) datumaro.components.project.Tree[source]
Reference conventions:
  • None or “” - working dir

  • “<40 symbols>” - revision hash

is_rev_cached(rev: Revision) bool[source]
is_obj_cached(obj_hash: ObjectId) bool[source]
cache_path(obj_hash: ObjectId) str[source]
source_data_dir(name: str) str[source]
remove_cache_obj(ref: Union[Revision, ObjectId])[source]
validate_source_name(name: str)[source]
compute_source_hash(data_dir: str, dvcfile: Optional[str] = None, no_cache: bool = True, allow_external: bool = True) ObjectId[source]
refresh_source_hash(source: str, no_cache: bool = True) ObjectId[source]

Computes and updates the source hash in the working directory.

Returns: hash

import_source(name: str, url: Optional[str], format: str, options: Optional[Dict] = None, *, no_cache: bool = True, no_hash: bool = True, rpath: Optional[str] = None) datumaro.components.config_model.Source[source]

Adds a new source (dataset) to the working directory of the project.

When ‘rpath’ is specified, will copy all the data from URL, but read only the specified file. Required to support subtasks and subsets in datasets.

Parameters
  • name (str) – Name of the new source

  • url (str) – URL of the new source. A path to a file or directory

  • format (str) – Dataset format

  • options (dict) – Options for the format Extractor

  • no_cache (bool) – Don’t put a copy of files into the project cache. Can be used to reduce project cache size.

  • no_hash (bool) – Don’t compute source data hash. Implies “no_cache”. Useful to reduce import time at the cost of disabled data integrity checks.

  • rpath (str) – Used to specify a relative path to the dataset inside the directory pointed to by URL.

Returns: the new source config

add_source(path: str, format: str, options: Optional[Dict] = None, *, rpath: Optional[str] = None) Tuple[str, datumaro.components.config_model.Source][source]

Adds a new source (dataset) from the working directory of the project.

Only directories from the project root can be added. This command is useful after a source was removed and you need to re-add it, or when the dataset was copied or downloaded manually.

When ‘rpath’ is specified, will copy all the data from the path, but read only the specified file. Required to support subtasks and subsets in datasets.

Parameters
  • path (str) – Path of the new source. A path to a directory

  • format (str) – Dataset format

  • options (dict) – Options for the format Extractor

  • rpath (str) – Used to specify a relative path to the dataset inside the directory pointed to by path.

Returns: the name and the config of the new source

remove_source(name: str, *, force: bool = False, keep_data: bool = True)[source]
Options:
  • force (bool) - ignores errors and tries to wipe remaining data

  • keep_data (bool) - leaves source data untouched

commit(message: str, *, no_cache: bool = False, allow_empty: bool = False, allow_foreign: bool = False) Revision[source]

Copies tree and objects from the working dir to the cache. Creates a new commit. Moves the HEAD pointer to the new commit.

Options:

  • no_cache (bool) - don’t put added dataset data into cache, store only metainfo. Can be used to reduce storage size.

  • allow_empty (bool) - allow commits with no changes.

  • allow_foreign (bool) - allow commits with changes not made by Datumaro.

Returns: the new commit hash

checkout(rev: Union[None, Revision] = None, sources: Union[None, str, Iterable[str]] = None, *, force: bool = False)[source]

Copies tree and objects from the cache to the working tree.

Sets HEAD to the specified revision, unless sources are specified. When sources are specified, only copies objects from the cache to the working tree. When no revision and no sources are specified, restores the sources from the current revision.

By default, uses the current (HEAD) revision.

Options:
  • force (bool) - ignore unsaved changes. By default, an error is raised

is_ref(ref: Union[None, str]) bool[source]
has_commits() bool[source]
status() Dict[str, datumaro.components.project.DiffStatus][source]
history(max_count=10) List[Tuple[Revision, str]][source]
diff(rev_a: Union[datumaro.components.project.Tree, Revision], rev_b: Union[datumaro.components.project.Tree, Revision]) Dict[str, datumaro.components.project.DiffStatus][source]

Compares 2 revision trees.

Returns: { target_name: status } for changed targets

model_data_dir(name: str) str[source]
make_model(name: str) datumaro.components.launcher.Launcher[source]
add_model(name: str, launcher: str, options: Optional[Dict[str, Any]] = None) datumaro.components.config_model.Model[source]
remove_model(name: str)[source]