DataChain

The DataChain class creates a data chain, which is a sequence of data manipulation steps such as reading data from storage, running AI or LLM models, or calling external service APIs to validate or enrich data. See DataChain for examples of how to create a chain.

C `module-attribute`

C = Column

Column

Column(
    text, type_=None, is_literal=False, _selectable=None
)

Bases: ColumnClause

Source code in datachain/query/schema.py

def __init__(self, text, type_=None, is_literal=False, _selectable=None):
    """Create a dataset column.

    `text` is converted with `ColumnMeta.to_db_name`; the converted value is
    stored as `self.name` and handed to the parent constructor along with
    the remaining arguments, which are passed through unchanged.
    """
    parent_kwargs = {
        "type_": type_,
        "is_literal": is_literal,
        "_selectable": _selectable,
    }
    self.name = ColumnMeta.to_db_name(text)
    super().__init__(self.name, **parent_kwargs)

glob

glob(glob_str)

Search for matches using glob pattern matching.

Source code in datachain/query/schema.py

def glob(self, glob_str):
    """Apply the `GLOB` operator to this column with `glob_str` as pattern."""
    glob_operator = self.op("GLOB")
    return glob_operator(glob_str)

regexp

regexp(regexp_str)

Search for matches using regexp pattern matching.

Source code in datachain/query/schema.py

def regexp(self, regexp_str):
    """Apply the `REGEXP` operator to this column with `regexp_str` as pattern."""
    regexp_operator = self.op("REGEXP")
    return regexp_operator(regexp_str)

read_csv

read_csv(
    path,
    delimiter: Optional[str] = None,
    header: bool = True,
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows=None,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    column_types: Optional[
        dict[str, Union[str, DataType]]
    ] = None,
    parse_options: Optional[
        dict[str, Union[str, Union[bool, Callable]]]
    ] = None,
    **kwargs
) -> DataChain

Generate chain from csv files.

Parameters:

path –

Storage URI with directory. URI must start with storage prefix such as s3://, gs://, az:// or "file:///".
delimiter –

Character for delimiting columns. Takes precedence if also specified in parse_options. Defaults to ",".
header –

Whether the files include a header row.
output –

Dictionary or feature class defining column names and their corresponding types. List of column names is also accepted, in which case types will be inferred.
column –

Created column name.
model_name –

Generated model name.
source –

Whether to include info about the source file.
nrows –

Optional row limit.
session –

Session to use for the chain.
settings –

Settings to use for the chain.
column_types –

Dictionary of column names and their corresponding types. It is passed to CSV reader and for each column specified type auto inference is disabled.
parse_options (Optional[dict[str, Union[str, Union[bool, Callable]]]], default: None ) –

Tells the parser how to process lines. See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html

Example

Reading a csv file:

import datachain as dc
chain = dc.read_csv("s3://mybucket/file.csv")

Reading csv files from a directory as a combined dataset:

import datachain as dc
chain = dc.read_csv("s3://mybucket/dir")

Source code in datachain/lib/dc/csv.py

def read_csv(
    path,
    delimiter: Optional[str] = None,
    header: bool = True,
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows=None,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
    parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
    **kwargs,
) -> "DataChain":
    """Generate chain from csv files.

    Parameters:
        path : Storage URI with directory. URI must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///".
        delimiter : Character for delimiting columns. Takes precedence if also
            specified in `parse_options`. Defaults to ",".
        header : Whether the files include a header row.
        output : Dictionary or feature class defining column names and their
            corresponding types. List of column names is also accepted, in which
            case types will be inferred.
        column : Created column name.
        model_name : Generated model name.
        source : Whether to include info about the source file.
        nrows : Optional row limit.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        column_types : Dictionary of column names and their corresponding types.
            It is passed to CSV reader and for each column specified type auto
            inference is disabled.
        parse_options: Tells the parser how to process lines. The passed
            dictionary is copied and never modified.
            See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
        kwargs : Extra keyword arguments forwarded to `read_storage`.

    Raises:
        DatasetPrepareError: If `header` is False and `output` is missing, or
            `output` is of a type from which column names cannot be derived.

    Example:
        Reading a csv file:
        ```py
        import datachain as dc
        chain = dc.read_csv("s3://mybucket/file.csv")
        ```

        Reading csv files from a directory as a combined dataset:
        ```py
        import datachain as dc
        chain = dc.read_csv("s3://mybucket/dir")
        ```
    """
    from pandas.io.parsers.readers import STR_NA_VALUES
    from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
    from pyarrow.dataset import CsvFileFormat
    from pyarrow.lib import type_for_alias

    from .storage import read_storage

    # Work on a copy so the dictionary supplied by the caller is left untouched.
    parse_kwargs = dict(parse_options or {})
    parse_kwargs.setdefault("delimiter", ",")
    if delimiter:
        # An explicit `delimiter` argument wins over the one in `parse_options`.
        parse_kwargs["delimiter"] = delimiter

    # String aliases are resolved to arrow types; other values are used as given.
    arrow_column_types = {
        name: type_for_alias(typ) if isinstance(typ, str) else typ
        for name, typ in (column_types or {}).items()
    }

    chain = read_storage(path, session=session, settings=settings, **kwargs)

    # Without a header row the column names have to come from `output`.
    column_names = None
    if not header:
        if not output:
            msg = "error parsing csv - provide output if no header"
            raise DatasetPrepareError(chain.name, msg)
        if isinstance(output, Sequence):
            column_names = output  # type: ignore[assignment]
        elif isinstance(output, dict):
            column_names = list(output.keys())
        elif (fr := ModelStore.to_pydantic(output)) is not None:
            column_names = list(fr.model_fields.keys())
        else:
            msg = f"error parsing csv - incompatible output type {type(output)}"
            raise DatasetPrepareError(chain.name, msg)

    arrow_parse_options = ParseOptions(**parse_kwargs)
    csv_format = CsvFileFormat(
        parse_options=arrow_parse_options,
        read_options=ReadOptions(column_names=column_names),
        convert_options=ConvertOptions(
            strings_can_be_null=True,
            null_values=STR_NA_VALUES,
            column_types=arrow_column_types,
        ),
    )
    return chain.parse_tabular(
        output=output,
        column=column,
        model_name=model_name,
        source=source,
        nrows=nrows,
        format=csv_format,
        parse_options=arrow_parse_options,
    )

read_dataset

read_dataset(
    name: str,
    version: Optional[Union[str, int]] = None,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    fallback_to_studio: bool = True,
    delta: Optional[bool] = False,
    delta_on: Optional[Union[str, Sequence[str]]] = None,
    delta_result_on: Optional[
        Union[str, Sequence[str]]
    ] = None,
    delta_compare: Optional[
        Union[str, Sequence[str]]
    ] = None,
) -> DataChain

Get data from a saved Dataset. It returns the chain itself. If dataset or version is not found locally, it will try to pull it from Studio.

Parameters:

name –

dataset name
version –

dataset version
session –

Session to use for the chain.
settings –

Settings to use for the chain.
fallback_to_studio –

Try to pull dataset from Studio if not found locally. Default is True.
delta (Optional[bool], default: False ) –

If set to True, we optimize the creation of new dataset versions by calculating the diff between the latest version of this storage and the version used to create the most recent version of the resulting chain dataset (the one specified in .save()). We then run the "diff" chain using only the diff data, rather than the entire storage data, and merge that diff chain with the latest version of the resulting dataset to create a new version. This approach avoids applying modifications to all records from storage every time, which can be an expensive operation. The diff is calculated using the DataChain.compare() method, which compares the delta_on fields to find matches and checks the compare fields to determine if a record has changed. Note that this process only considers added and modified records in storage; deleted records are not removed from the new dataset version. This calculation is based on the difference between the current version of the source and the version used to create the dataset.
delta_on (Optional[Union[str, Sequence[str]]], default: None ) –

A list of fields that uniquely identify rows in the source. If two rows have the same values, they are considered the same (e.g., they could be different versions of the same row in a versioned source). This is used in the delta update to calculate the diff.
delta_result_on (Optional[Union[str, Sequence[str]]], default: None ) –

A list of fields in the resulting dataset that correspond to the delta_on fields from the source. This is needed to identify rows that have changed in the source but are already present in the current version of the resulting dataset, in order to avoid including outdated versions of those rows in the new dataset. We retain only the latest versions of rows to prevent duplication. There is no need to define this if the delta_on fields are present in the final dataset and have not been renamed.
delta_compare (Optional[Union[str, Sequence[str]]], default: None ) –

A list of fields used to check if the same row has been modified in the new version of the source. If not defined, all fields except those defined in delta_on will be used.

Example

import datachain as dc
chain = dc.read_dataset("my_cats")

chain = dc.read_dataset("my_cats", fallback_to_studio=False)

chain = dc.read_dataset("my_cats", version="1.0.0")

session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
settings = {
    "cache": True,
    "parallel": 4,
    "workers": 4,
    "min_task_size": 1000,
    "prefetch": 10,
}
chain = dc.read_dataset(
    name="my_cats",
    version="1.0.0",
    session=session,
    settings=settings,
    fallback_to_studio=True,
)

Source code in datachain/lib/dc/datasets.py

def read_dataset(
    name: str,
    version: Optional[Union[str, int]] = None,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    fallback_to_studio: bool = True,
    delta: Optional[bool] = False,
    delta_on: Optional[Union[str, Sequence[str]]] = None,
    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
    delta_compare: Optional[Union[str, Sequence[str]]] = None,
) -> "DataChain":
    """Get data from a saved Dataset. It returns the chain itself.
    If dataset or version is not found locally, it will try to pull it from Studio.

    Parameters:
        name : dataset name
        version : dataset version. A semver string is used as is. A value that
            can be converted to an integer is treated as a major version and
            resolved to the latest dataset version with that major part.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        fallback_to_studio : Try to pull dataset from Studio if not found locally.
            Default is True.
        delta: If set to True, we optimize the creation of new dataset versions by
            calculating the diff between the latest version of this storage and the
            version used to create the most recent version of the resulting chain
            dataset (the one specified in `.save()`). We then run the "diff" chain
            using only the diff data, rather than the entire storage data, and merge
            that diff chain with the latest version of the resulting dataset to create
            a new version. This approach avoids applying modifications to all records
            from storage every time, which can be an expensive operation.
            The diff is calculated using the `DataChain.compare()` method, which
            compares the `delta_on` fields to find matches and checks the compare
            fields to determine if a record has changed. Note that this process only
            considers added and modified records in storage; deleted records are not
            removed from the new dataset version.
            This calculation is based on the difference between the current version
            of the source and the version used to create the dataset.
        delta_on: A list of fields that uniquely identify rows in the source.
            If two rows have the same values, they are considered the same (e.g., they
            could be different versions of the same row in a versioned source).
            This is used in the delta update to calculate the diff.
        delta_result_on: A list of fields in the resulting dataset that correspond
            to the `delta_on` fields from the source.
            This is needed to identify rows that have changed in the source but are
            already present in the current version of the resulting dataset, in order
            to avoid including outdated versions of those rows in the new dataset.
            We retain only the latest versions of rows to prevent duplication.
            There is no need to define this if the `delta_on` fields are present in
            the final dataset and have not been renamed.
        delta_compare: A list of fields used to check if the same row has been modified
            in the new version of the source.
            If not defined, all fields except those defined in delta_on will be used.

    Raises:
        DatasetVersionNotFoundError: If `version` is an integer-like value and the
            dataset has no version whose major part equals it.

    Example:
        ```py
        import datachain as dc
        chain = dc.read_dataset("my_cats")
        ```

        ```py
        chain = dc.read_dataset("my_cats", fallback_to_studio=False)
        ```

        ```py
        chain = dc.read_dataset("my_cats", version="1.0.0")
        ```

        ```py
        session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
        settings = {
            "cache": True,
            "parallel": 4,
            "workers": 4,
            "min_task_size": 1000,
            "prefetch": 10,
        }
        chain = dc.read_dataset(
            name="my_cats",
            version="1.0.0",
            session=session,
            settings=settings,
            fallback_to_studio=True,
        )
        ```
    """
    from datachain.telemetry import telemetry

    from .datachain import DataChain

    if version is not None:
        # For backward compatibility we still allow users to put version as integer
        # in which case we are trying to find latest version where major part is
        # equal to that input version. For example if user sets version=2, we could
        # continue with something like 2.4.3 (assuming 2.4.3 is the biggest among
        # all 2.* dataset versions). If dataset doesn't have any versions where
        # major part is equal to that input, exception is thrown.
        #
        # Only the `int()` conversion is guarded: a ValueError coming out of the
        # catalog lookup must not be mistaken for "version is a semver string".
        try:
            major = int(version)
        except ValueError:
            # version is in new semver string format, continuing as normal
            pass
        else:
            dataset = Session.get(session).catalog.get_dataset(name)
            latest_major = dataset.latest_major_version(major)
            if not latest_major:
                raise DatasetVersionNotFoundError(
                    f"Dataset {name} does not have version {version}"
                )
            version = latest_major

    query = DatasetQuery(
        name=name,
        version=version,  #  type: ignore[arg-type]
        session=session,
        indexing_column_types=File._datachain_column_types,
        fallback_to_studio=fallback_to_studio,
    )
    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
    _settings = Settings(**settings) if settings else Settings()

    # The schema always starts with the `sys` signal; the rest comes from the
    # stored feature schema when there is one, otherwise from the column types.
    signals_schema = SignalSchema({"sys": Sys})
    if query.feature_schema:
        signals_schema |= SignalSchema.deserialize(query.feature_schema)
    else:
        signals_schema |= SignalSchema.from_column_types(query.column_types or {})

    chain = DataChain(query, _settings, signals_schema)
    if delta:
        chain = chain._as_delta(
            on=delta_on, right_on=delta_result_on, compare=delta_compare
        )
    return chain

datasets

datasets(
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    column: Optional[str] = None,
    include_listing: bool = False,
    studio: bool = False,
    attrs: Optional[list[str]] = None,
) -> DataChain

Generate chain with list of registered datasets.

Parameters:

session (Optional[Session], default: None ) –

Optional session instance. If not provided, uses default session.
settings (Optional[dict], default: None ) –

Optional dictionary of settings to configure the chain.
in_memory (bool, default: False ) –

If True, creates an in-memory session. Defaults to False.
column (Optional[str], default: None ) –

Name of the output column in the chain. Defaults to None which means no top level column will be created.
include_listing (bool, default: False ) –

If True, includes listing datasets. Defaults to False.
studio (bool, default: False ) –

If True, returns datasets from Studio only, otherwise returns all local datasets. Defaults to False.
attrs (Optional[list[str]], default: None ) –

Optional list of attributes to filter datasets on. It can be just an attribute without a value, e.g. "NLP", or an attribute with a value, e.g. "location=US". An attribute with a value can also accept "*" to target all datasets that have that attribute name, e.g. "location=*".

Returns:

DataChain ( DataChain ) –

A new DataChain instance containing dataset information.

Example

import datachain as dc

chain = dc.datasets(column="dataset")
for ds in chain.collect("dataset"):
    print(f"{ds.name}@v{ds.version}")

Source code in datachain/lib/dc/datasets.py

def datasets(
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    column: Optional[str] = None,
    include_listing: bool = False,
    studio: bool = False,
    attrs: Optional[list[str]] = None,
) -> "DataChain":
    """Build a chain that lists the registered datasets.

    Temporary datasets are always left out of the result.

    Args:
        session: Optional session instance. If not provided, uses default session.
        settings: Optional dictionary of settings to configure the chain.
        in_memory: If True, creates an in-memory session. Defaults to False.
        column: Name of the output column in the chain. When empty or None, no
            top level column is created and the dataset fields are flattened
            into separate columns instead.
        include_listing: If True, includes listing datasets. Defaults to False.
        studio: If True, returns datasets from Studio only,
            otherwise returns all local datasets. Defaults to False.
        attrs: Optional list of attributes to filter datasets on; a dataset is
            kept only if it has every listed attribute. An entry can be just an
            attribute without value e.g "NLP", or attribute with value
            e.g "location=US". Attribute with value can also accept "*" to target
            all that have specific name e.g "location=*"

    Returns:
        DataChain: A new DataChain instance containing dataset information.

    Example:
        ```py
        import datachain as dc

        chain = dc.datasets(column="dataset")
        for ds in chain.collect("dataset"):
            print(f"{ds.name}@v{ds.version}")
        ```
    """

    session = Session.get(session, in_memory=in_memory)

    infos = []
    for dataset, version, job in session.catalog.list_datasets_versions(
        include_listing=include_listing, studio=studio
    ):
        info = DatasetInfo.from_models(dataset, version, job)
        if not info.is_temp:
            infos.append(info)

    # Each requested attribute narrows the remaining datasets further.
    for attr in attrs or ():
        infos = [info for info in infos if info.has_attr(attr)]

    if column:
        # Keep every dataset as a single object under the requested column.
        return read_values(
            session=session,
            settings=settings,
            in_memory=in_memory,
            output={column: DatasetInfo},
            **{column: infos},  # type: ignore[arg-type]
        )

    # No top level column requested: one column per dataset field.
    schema = {}
    for field_name, hint in get_type_hints(DatasetInfo).items():
        if field_name in DatasetInfo.model_fields:
            # Parametrized dict hints are reduced to plain `dict`.
            schema[field_name] = dict if get_origin(hint) is dict else hint

    data = {field_name: [] for field_name in DatasetInfo.model_fields}  # type: ignore[var-annotated]
    for info in infos:
        for field, value in info.model_dump().items():
            data[field].append(value)

    return read_values(
        session=session,
        settings=settings,
        in_memory=in_memory,
        output=schema,
        **data,  # type: ignore[arg-type]
    )

read_hf

read_hf(
    dataset: Union[str, HFDatasetType],
    *args,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    column: str = "",
    model_name: str = "",
    **kwargs
) -> DataChain

Generate chain from huggingface hub dataset.

Parameters:

dataset –

Path or name of the dataset to read from Hugging Face Hub, or an instance of datasets.Dataset-like object.
session –

Session to use for the chain.
settings –

Settings to use for the chain.
column –

Generated object column name.
model_name –

Generated model name.
kwargs –

Parameters to pass to datasets.load_dataset.

Example

Load from Hugging Face Hub:

import datachain as dc
chain = dc.read_hf("beans", split="train")

Generate chain from loaded dataset:

from datasets import load_dataset
ds = load_dataset("beans", split="train")
import datachain as dc
chain = dc.read_hf(ds)

Source code in datachain/lib/dc/hf.py

def read_hf(
    dataset: Union[str, "HFDatasetType"],
    *args,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    column: str = "",
    model_name: str = "",
    **kwargs,
) -> "DataChain":
    """Generate chain from huggingface hub dataset.

    Parameters:
        dataset : Path or name of the dataset to read from Hugging Face Hub,
            or an instance of `datasets.Dataset`-like object.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        column : Generated object column name. When given, the generated model
            is placed under this single column.
        model_name : Generated model name. Falls back to `column` when empty.
        kwargs : Parameters to pass to datasets.load_dataset.

    Example:
        Load from Hugging Face Hub:
        ```py
        import datachain as dc
        chain = dc.read_hf("beans", split="train")
        ```

        Generate chain from loaded dataset:
        ```py
        from datasets import load_dataset
        ds = load_dataset("beans", split="train")
        import datachain as dc
        chain = dc.read_hf(ds)
        ```
    """
    from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits

    from .values import read_values

    splits = stream_splits(dataset, *args, **kwargs)

    # A "split" field is part of the schema only when there are several splits.
    output: dict[str, DataType] = {"split": str} if len(splits) > 1 else {}

    model_name = model_name or column or ""
    # The features of the first split define the schema.
    first_split = next(iter(splits.values()))
    output = output | get_output_schema(first_split.features)
    model = dict_to_data_model(model_name, output)
    if column:
        output = {column: model}

    split_names = list(splits.keys())
    chain = read_values(split=split_names, session=session, settings=settings)
    generator = HFGenerator(dataset, model, *args, **kwargs)
    return chain.gen(generator, output=output)

read_json

read_json(
    path: Union[str, PathLike[str]],
    type: FileType = "text",
    spec: Optional[DataType] = None,
    schema_from: Optional[str] = "auto",
    jmespath: Optional[str] = None,
    column: Optional[str] = "",
    model_name: Optional[str] = None,
    format: Optional[str] = "json",
    nrows=None,
    **kwargs
) -> DataChain

Get data from JSON. It returns the chain itself.

Parameters:

path –

storage URI with directory. URI must start with storage prefix such as s3://, gs://, az:// or "file:///"
type –

read file as "binary", "text", or "image" data. Default is "text".
spec –

optional Data Model
schema_from –

path to sample to infer spec (if schema not provided)
column –

generated column name
model_name –

optional generated model name
format (Optional[str], default: 'json' ) –

"json", "jsonl"
jmespath –

optional JMESPATH expression to reduce JSON
nrows –

optional row limit for jsonl and JSON arrays

Example

infer JSON schema from data, reduce using JMESPATH

import datachain as dc
chain = dc.read_json("gs://json", jmespath="key1.key2")

infer JSON schema from a particular path

import datachain as dc
chain = dc.read_json("gs://json_ds", schema_from="gs://json/my.json")

Source code in datachain/lib/dc/json.py

def read_json(
    path: Union[str, os.PathLike[str]],
    type: FileType = "text",
    spec: Optional[DataType] = None,
    schema_from: Optional[str] = "auto",
    jmespath: Optional[str] = None,
    column: Optional[str] = "",
    model_name: Optional[str] = None,
    format: Optional[str] = "json",
    nrows=None,
    **kwargs,
) -> "DataChain":
    """Get data from JSON. It returns the chain itself.

    Parameters:
        path : storage URI with directory. URI must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///"
        type : read file as "binary", "text", or "image" data. Default is "text".
        spec : optional Data Model
        schema_from : path to sample to infer spec (if schema not provided).
            The default "auto" means `path` itself is used.
        jmespath : optional JMESPATH expression to reduce JSON
        column : generated column name. When empty, it is derived from the
            leading word characters of `jmespath`, and if that yields nothing,
            `format` is used.
        model_name : optional generated model name
        format: "json", "jsonl"
        nrows : optional row limit for jsonl and JSON arrays
        kwargs : extra keyword arguments forwarded to `read_storage`

    Example:
        infer JSON schema from data, reduce using JMESPATH
        ```py
        import datachain as dc
        chain = dc.read_json("gs://json", jmespath="key1.key2")
        ```

        infer JSON schema from a particular path
        ```py
        import datachain as dc
        chain = dc.read_json("gs://json_ds", schema_from="gs://json/my.json")
        ```
    """
    from .storage import read_storage

    if schema_from == "auto":
        schema_from = os.fspath(path)

    def jmespath_to_name(s: str):
        # Everything up to the first non-word character (possibly nothing).
        return re.match(r"\w*", s).group()  # type: ignore[union-attr]

    if not column and jmespath:
        column = jmespath_to_name(jmespath)
    # Still nothing usable (no jmespath or no leading name): use the format.
    column = column or format

    chain = read_storage(uri=path, type=type, **kwargs)
    meta_reader = read_meta(
        schema_from=schema_from,
        format=format,
        spec=spec,
        model_name=model_name,
        jmespath=jmespath,
        nrows=nrows,
    )
    # disable prefetch if nrows is set
    chain_settings = {"prefetch": 0} if nrows else {}
    signals = {column: meta_reader, "params": {"file": File}}
    return chain.settings(**chain_settings).gen(**signals)  # type: ignore[misc, arg-type]

listings

listings(
    session: Optional[Session] = None,
    in_memory: bool = False,
    column: str = "listing",
    **kwargs
) -> DataChain

Generate chain with list of cached listings. Listing is a special kind of dataset which has directory listing data of some underlying storage (e.g S3 bucket).

Example

import datachain as dc
dc.listings().show()

Source code in datachain/lib/dc/listings.py

def listings(
    session: Optional[Session] = None,
    in_memory: bool = False,
    column: str = "listing",
    **kwargs,
) -> "DataChain":
    """Generate chain with list of cached listings.

    A listing is a special kind of dataset which holds directory listing data
    of some underlying storage (e.g. an S3 bucket).

    Parameters:
        session : Session to use for the chain.
        in_memory : Passed to `Session.get` and to `read_values`.
        column : Name of the output column holding the listing objects.
        kwargs : May contain a `catalog` entry; when it is set to a truthy
            value, that catalog is queried instead of the session's catalog.

    Example:
        ```py
        import datachain as dc
        dc.listings().show()
        ```
    """
    session = Session.get(session, in_memory=in_memory)

    catalog = kwargs.get("catalog")
    if not catalog:
        catalog = session.catalog

    cached_listings = catalog.listings()
    return read_values(
        session=session,
        in_memory=in_memory,
        output={column: ListingInfo},
        **{column: cached_listings},  # type: ignore[arg-type]
    )

read_pandas

read_pandas(
    df: DataFrame,
    name: str = "",
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    column: str = "",
) -> DataChain

Generate chain from pandas data-frame.

Example

import pandas as pd
import datachain as dc

df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
dc.read_pandas(df)

Source code in datachain/lib/dc/pandas.py

def read_pandas(  # type: ignore[override]
    df: "pd.DataFrame",
    name: str = "",
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    column: str = "",
) -> "DataChain":
    """Generate chain from pandas data-frame.

    Column names are lower-cased; tuple names (MultiIndex columns) are joined
    with underscores first. Every resulting name has to be a valid identifier.

    Parameters:
        df : Data-frame whose columns become the chain values.
        name : Passed to `read_values` and used when reporting errors.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        in_memory : Passed through to `read_values`.
        column : Passed through to `read_values`.

    Raises:
        DatasetPrepareError: If a resulting column name is not a valid identifier.

    Example:
        ```py
        import pandas as pd
        import datachain as dc

        df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
        dc.read_pandas(df)
        ```
    """
    from .utils import DatasetPrepareError

    def get_col_name(col):
        # A plain name is handled as a one-element tuple, so both cases end up
        # as the underscore-joined, lower-cased string form of their parts.
        parts = col if isinstance(col, tuple) else (col,)
        return "_".join(map(str, parts)).lower()

    fr_map = {}
    for col in df.columns:
        key = get_col_name(col)
        fr_map[key] = df[col].tolist()

    for c in fr_map:
        if c.isidentifier():
            continue
        raise DatasetPrepareError(
            name,
            f"import from pandas error - '{c}' cannot be a column name",
        )

    return read_values(
        name,
        session,
        settings=settings,
        column=column,
        in_memory=in_memory,
        **fr_map,
    )

read_parquet

read_parquet(
    path,
    partitioning: Any = "hive",
    output: Optional[dict[str, DataType]] = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    **kwargs
) -> DataChain

Generate chain from parquet files.

Parameters:

path –

Storage URI with directory. URI must start with storage prefix such as s3://, gs://, az:// or "file:///".
partitioning –

Any pyarrow partitioning schema.
output –

Dictionary defining column names and their corresponding types.
column –

Created column name.
model_name –

Generated model name.
source –

Whether to include info about the source file.
session –

Session to use for the chain.
settings –

Settings to use for the chain.

Example

Reading a single file:

import datachain as dc
dc.read_parquet("s3://mybucket/file.parquet")

Reading a partitioned dataset from a directory:

import datachain as dc
dc.read_parquet("s3://mybucket/dir")

Source code in datachain/lib/dc/parquet.py

def read_parquet(
    path,
    partitioning: Any = "hive",
    output: Optional[dict[str, DataType]] = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    **kwargs,
) -> "DataChain":
    """Build a chain from one or more parquet files.

    The files are first collected with `read_storage` and the resulting chain
    is then parsed as tabular data using the "parquet" format.

    Parameters:
        path : Storage URI with directory. URI must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///".
        partitioning : Any pyarrow partitioning schema.
        output : Dictionary defining column names and their corresponding types.
        column : Created column name.
        model_name : Generated model name.
        source : Whether to include info about the source file.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        **kwargs : Additional keyword arguments forwarded to `read_storage`.

    Example:
        Reading a single file:
        ```py
        import datachain as dc
        dc.read_parquet("s3://mybucket/file.parquet")
        ```

        Reading a partitioned dataset from a directory:
        ```py
        import datachain as dc
        dc.read_parquet("s3://mybucket/dir")
        ```
    """
    from .storage import read_storage

    # Everything that controls how the listed files are turned into rows.
    tabular_options = {
        "output": output,
        "column": column,
        "model_name": model_name,
        "source": source,
        "format": "parquet",
        "partitioning": partitioning,
    }
    files = read_storage(path, session=session, settings=settings, **kwargs)
    return files.parse_tabular(**tabular_options)

read_records

read_records(
    to_insert: Optional[Union[dict, Iterable[dict]]],
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    schema: Optional[dict[str, DataType]] = None,
) -> DataChain

Create a DataChain from the provided records. This method can be used for programmatically generating a chain, in contrast to reading data from storages or other sources.

Parameters:

to_insert –

records (or a single record) to insert. Each record is a dictionary of signals and their values.
schema –

describes chain signals and their corresponding types

Example

import datachain as dc
single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)

Notes

This call blocks until all records are inserted.

Source code in datachain/lib/dc/records.py

def read_records(
    to_insert: Optional[Union[dict, Iterable[dict]]],
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    schema: Optional[dict[str, DataType]] = None,
) -> "DataChain":
    """Create a DataChain from explicitly provided records.

    Use this to generate a chain programmatically instead of reading data
    from storages or other sources. The records are written into a freshly
    created temporary dataset, and a chain reading that dataset is returned.

    Parameters:
        to_insert : records (or a single record) to insert. Each record is
                    a dictionary of signals and their values.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        in_memory : passed to `Session.get` when resolving the session.
        schema : describes chain signals and their corresponding types.
                 When omitted, the column types of `File` are used.

    Example:
        ```py
        import datachain as dc
        single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
        ```

    Notes:
        This call blocks until all records are inserted.
    """
    from datachain.query.dataset import INSERT_BATCH_SIZE, adjust_outputs, get_col_types
    from datachain.sql.types import SQLType
    from datachain.utils import batched

    from .datasets import read_dataset

    session = Session.get(session, in_memory=in_memory)
    catalog = session.catalog
    ds_name = session.generate_temp_dataset_name()

    # Table layout: taken from the explicit schema when given, otherwise
    # from the default File columns.
    signal_schema = SignalSchema(schema) if schema else None
    columns: list[sqlalchemy.Column]
    if schema:
        columns = [
            sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
            for c in signal_schema.db_signals(as_columns=True)
        ]
    else:
        columns = [
            sqlalchemy.Column(col_name, col_type)
            for col_name, col_type in File._datachain_column_types.items()
        ]

    feature_schema = (
        signal_schema.clone_without_sys_signals().serialize()
        if signal_schema
        else None
    )
    dsr = catalog.create_dataset(
        ds_name,
        columns=columns,
        feature_schema=feature_schema,
    )

    session.add_dataset_version(dsr, dsr.latest_version)

    # Normalize the input: a lone dict is one record, a falsy value means
    # "nothing to insert", anything else is used as the iterable it is.
    if isinstance(to_insert, dict):
        rows = [to_insert]
    elif to_insert:
        rows = to_insert
    else:
        rows = []

    warehouse = catalog.warehouse
    table = warehouse.dataset_rows(dsr).get_table()

    # Optimization: Compute row types once, rather than for every row.
    sql_typed = {c.name: c.type for c in columns if isinstance(c.type, SQLType)}
    col_types = get_col_types(warehouse, sql_typed)

    # Rows are adjusted lazily and written batch by batch.
    adjusted = (adjust_outputs(warehouse, row, col_types) for row in rows)
    for batch in batched(adjusted, INSERT_BATCH_SIZE):
        warehouse.insert_rows(table, batch)
    warehouse.insert_rows_done(table)

    return read_dataset(name=dsr.name, session=session, settings=settings)

read_storage

read_storage(
    uri: Union[
        str, PathLike[str], list[str], list[PathLike[str]]
    ],
    *,
    type: FileType = "binary",
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    recursive: Optional[bool] = True,
    column: str = "file",
    update: bool = False,
    anon: bool = False,
    delta: Optional[bool] = False,
    delta_on: Optional[Union[str, Sequence[str]]] = None,
    delta_result_on: Optional[
        Union[str, Sequence[str]]
    ] = None,
    delta_compare: Optional[
        Union[str, Sequence[str]]
    ] = None,
    client_config: Optional[dict] = None
) -> DataChain

Get data from storage(s) as a list of files with all file attributes. It returns the chain itself as usual.

Parameters:

uri –

storage URI with directory or list of URIs. URIs must start with storage prefix such as s3://, gs://, az:// or "file:///"
type –

read file as "binary", "text", or "image" data. Default is "binary".
recursive –

search recursively for the given path.
column –

Created column name.
update –

force storage reindexing. Default is False.
anon –

If True, we will treat cloud bucket as public one
client_config –

Optional client configuration for the storage client.
delta (Optional[bool], default: False ) –

If set to True, we optimize the creation of new dataset versions by calculating the diff between the latest version of this storage and the version used to create the most recent version of the resulting chain dataset (the one specified in .save()). We then run the "diff" chain using only the diff data, rather than the entire storage data, and merge that diff chain with the latest version of the resulting dataset to create a new version. This approach avoids applying modifications to all records from storage every time, which can be an expensive operation. The diff is calculated using the DataChain.compare() method, which compares the delta_on fields to find matches and checks the compare fields to determine if a record has changed. Note that this process only considers added and modified records in storage; deleted records are not removed from the new dataset version. This calculation is based on the difference between the current version of the source and the version used to create the dataset.
delta_on (Optional[Union[str, Sequence[str]]], default: None ) –

A list of fields that uniquely identify rows in the source. If two rows have the same values, they are considered the same (e.g., they could be different versions of the same row in a versioned source). This is used in the delta update to calculate the diff.
delta_result_on (Optional[Union[str, Sequence[str]]], default: None ) –

A list of fields in the resulting dataset that correspond to the delta_on fields from the source. This is needed to identify rows that have changed in the source but are already present in the current version of the resulting dataset, in order to avoid including outdated versions of those rows in the new dataset. We retain only the latest versions of rows to prevent duplication. There is no need to define this if the delta_on fields are present in the final dataset and have not been renamed.
delta_compare (Optional[Union[str, Sequence[str]]], default: None ) –

A list of fields used to check if the same row has been modified in the new version of the source. If not defined, all fields except those defined in delta_on will be used.

Returns:

DataChain ( DataChain ) –

A DataChain object containing the file information.

Examples:

Simple call from s3:

import datachain as dc
chain = dc.read_storage("s3://my-bucket/my-dir")

Multiple URIs:

chain = dc.read_storage([
    "s3://bucket1/dir1",
    "s3://bucket2/dir2"
])

With AWS S3-compatible storage:

chain = dc.read_storage(
    "s3://my-bucket/my-dir",
    client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
)

Pass existing session

session = Session.get()
chain = dc.read_storage([
    "path/to/dir1",
    "path/to/dir2"
], session=session, recursive=True)

Note

When using multiple URIs with update=True, the function optimizes by avoiding redundant updates for URIs pointing to the same storage location.

Source code in datachain/lib/dc/storage.py

def read_storage(
    uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
    *,
    type: FileType = "binary",
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    recursive: Optional[bool] = True,
    column: str = "file",
    update: bool = False,
    anon: bool = False,
    delta: Optional[bool] = False,
    delta_on: Optional[Union[str, Sequence[str]]] = None,
    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
    delta_compare: Optional[Union[str, Sequence[str]]] = None,
    client_config: Optional[dict] = None,
) -> "DataChain":
    """Get data from storage(s) as a list of files with all file attributes.
    It returns the chain itself as usual.

    Parameters:
        uri : storage URI with directory or list of URIs.
            URIs must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///"
        type : read file as "binary", "text", or "image" data. Default is "binary".
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        in_memory : passed to `Session.get` when resolving the session.
        recursive : search recursively for the given path.
        column : Created column name.
        update : force storage reindexing. Default is False.
        anon : If True, we will treat cloud bucket as public one
        client_config : Optional client configuration for the storage client.
        delta: If set to True, we optimize the creation of new dataset versions by
            calculating the diff between the latest version of this storage and the
            version used to create the most recent version of the resulting chain
            dataset (the one specified in `.save()`). We then run the "diff" chain
            using only the diff data, rather than the entire storage data, and merge
            that diff chain with the latest version of the resulting dataset to create
            a new version. This approach avoids applying modifications to all records
            from storage every time, which can be an expensive operation.
            The diff is calculated using the `DataChain.compare()` method, which
            compares the `delta_on` fields to find matches and checks the compare
            fields to determine if a record has changed. Note that this process only
            considers added and modified records in storage; deleted records are not
            removed from the new dataset version.
            This calculation is based on the difference between the current version
            of the source and the version used to create the dataset.
        delta_on: A list of fields that uniquely identify rows in the source.
            If two rows have the same values, they are considered the same (e.g., they
            could be different versions of the same row in a versioned source).
            This is used in the delta update to calculate the diff.
        delta_result_on: A list of fields in the resulting dataset that correspond
            to the `delta_on` fields from the source.
            This is needed to identify rows that have changed in the source but are
            already present in the current version of the resulting dataset, in order
            to avoid including outdated versions of those rows in the new dataset.
            We retain only the latest versions of rows to prevent duplication.
            There is no need to define this if the `delta_on` fields are present in
            the final dataset and have not been renamed.
        delta_compare: A list of fields used to check if the same row has been modified
            in the new version of the source.
            If not defined, all fields except those defined in `delta_on` will be used.

    Returns:
        DataChain: A DataChain object containing the file information.

    Raises:
        ValueError: If `uri` is an empty list or tuple.

    Examples:
        Simple call from s3:
        ```python
        import datachain as dc
        chain = dc.read_storage("s3://my-bucket/my-dir")
        ```

        Multiple URIs:
        ```python
        chain = dc.read_storage([
            "s3://bucket1/dir1",
            "s3://bucket2/dir2"
        ])
        ```

        With AWS S3-compatible storage:
        ```python
        chain = dc.read_storage(
            "s3://my-bucket/my-dir",
            client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
        )
        ```

        Pass existing session
        ```py
        session = Session.get()
        chain = dc.read_storage([
            "path/to/dir1",
            "path/to/dir2"
        ], session=session, recursive=True)
        ```

    Note:
        When using multiple URIs with `update=True`, the function optimizes by
        avoiding redundant updates for URIs pointing to the same storage location.
    """
    from .datachain import DataChain
    from .datasets import read_dataset
    from .records import read_records
    from .values import read_values

    file_type = get_file_type(type)

    if anon:
        client_config = (client_config or {}) | {"anon": True}
    session = Session.get(session, client_config=client_config, in_memory=in_memory)
    catalog = session.catalog
    cache = catalog.cache
    # From here on use the client config held by the session's catalog.
    client_config = session.catalog.client_config

    uris = uri if isinstance(uri, (list, tuple)) else [uri]

    if not uris:
        raise ValueError("No URIs provided")

    chains = []
    file_values = []

    for single_uri in uris:
        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
            single_uri, session, update=update
        )

        # list_ds_name is None if object is a file, we don't want to use cache
        # or do listing in that case - just read that single object
        if not list_ds_name:
            file_values.append(
                get_file_info(list_uri, cache, client_config=client_config)
            )
            continue

        dc = read_dataset(list_ds_name, session=session, settings=settings)
        dc._query.update = update
        dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

        if update or not list_ds_exists:

            def lst_fn(ds_name, lst_uri):
                # disable prefetch for listing, as it pre-downloads all files
                (
                    read_records(
                        DataChain.DEFAULT_FILE_RECORD,
                        session=session,
                        settings=settings,
                        in_memory=in_memory,
                    )
                    .settings(prefetch=0)
                    .gen(
                        list_bucket(lst_uri, cache, client_config=client_config),
                        output={f"{column}": file_type},
                    )
                    # for internal listing datasets, we always bump major version
                    .save(ds_name, listing=True, update_version="major")
                )

            # Bind the current loop values as lambda defaults so each listing
            # callback keeps its own dataset name and URI (avoids late binding).
            dc._query.set_listing_fn(
                lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
            )

        chains.append(ls(dc, list_path, recursive=recursive, column=column))

    storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)

    if file_values:
        # Store the single-file objects under the requested column name, not a
        # hard-coded "file", so this chain has the same signal as the listing
        # chains it may be unioned with.
        file_chain = read_values(
            session=session,
            settings=settings,
            in_memory=in_memory,
            **{column: file_values},
        )
        file_chain.signals_schema = file_chain.signals_schema.mutate(
            {f"{column}": file_type}
        )
        storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain

    # Every URI contributed either a listing chain or a file value, so a chain
    # exists here; the assert narrows the Optional for type checkers.
    assert storage_chain is not None

    if delta:
        storage_chain = storage_chain._as_delta(
            on=delta_on, right_on=delta_result_on, compare=delta_compare
        )
    return storage_chain

read_values

read_values(
    ds_name: str = "",
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    output: OutputType = None,
    column: str = "",
    **fr_map
) -> DataChain

Generate chain from list of values.

Example

import datachain as dc
dc.read_values(fib=[1, 2, 3, 5, 8])

Source code in datachain/lib/dc/values.py

def read_values(
    ds_name: str = "",
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    output: OutputType = None,
    column: str = "",
    **fr_map,
) -> "DataChain":
    """Generate a chain from lists of values passed as keyword arguments.

    Each keyword in `fr_map` names a signal and supplies its list of values.
    When `column` is given, the generated signals are wrapped into a single
    data model stored under that column name.

    Example:
        ```py
        import datachain as dc
        dc.read_values(fib=[1, 2, 3, 5, 8])
        ```
    """
    from .datachain import DataChain

    tuple_type, output, tuples = values_to_tuples(ds_name, output, **fr_map)

    def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
        for row in tuples:
            yield row

    # Present the generator under the public function's name.
    _func_fr.__name__ = "read_values"

    base_chain = read_records(
        DataChain.DEFAULT_FILE_RECORD,
        session=session,
        settings=settings,
        in_memory=in_memory,
    )
    if not column:
        return base_chain.gen(_func_fr, output=output)

    nested_output = {column: dict_to_data_model(column, output)}  # type: ignore[arg-type]
    return base_chain.gen(_func_fr, output=nested_output)

read_database

read_database(
    query: Union[str, Executable],
    connection: ConnectionType,
    params: Union[
        Sequence[Mapping[str, Any]], Mapping[str, Any], None
    ] = None,
    *,
    output: Optional[dict[str, DataType]] = None,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    infer_schema_length: Optional[int] = 100
) -> DataChain

Read the results of a SQL query into a DataChain, using a given database connection.

Parameters:

query (Union[str, Executable]) –

The SQL query to execute. Can be a raw SQL string or a SQLAlchemy Executable object.
connection (ConnectionType) –

SQLAlchemy connectable, str, or a sqlite3 connection. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible for engine disposal and connection closure for the SQLAlchemy connectable; str connections are closed automatically.
params (Union[Sequence[Mapping[str, Any]], Mapping[str, Any], None], default: None ) –

Parameters to pass to execute method.
output (Optional[dict[str, DataType]], default: None ) –

A dictionary mapping column names to types, used to override the schema inferred from the query results.
session (Optional[Session], default: None ) –

Session to use for the chain.
settings (Optional[dict], default: None ) –

Settings to use for the chain.
in_memory (bool, default: False ) –

If True, creates an in-memory session. Defaults to False.
infer_schema_length (Optional[int], default: 100 ) –

The maximum number of rows to scan for inferring schema. If set to None, the full data may be scanned. The rows used for schema inference are stored in memory, so large values can lead to high memory usage. Only applies if the output parameter is not set for the given column.

Examples:

Reading from a SQL query against a user-supplied connection:

query = "SELECT key, value FROM tbl"
chain = dc.read_database(query, connection, output={"value": float})

Load data from a SQLAlchemy driver/engine:

from sqlalchemy import create_engine
engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
chain = dc.read_database("select * from tbl", engine)

Load data from a parameterized SQLAlchemy query:

query = "SELECT key, value FROM tbl WHERE value > :value"
dc.read_database(query, engine, params={"value": 50})

Notes

This function works with a variety of databases — including, but not limited to, SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is installed.
This call is blocking, and will execute the query and return once the results are saved.

Source code in datachain/lib/dc/database.py

def read_database(
    query: Union[str, "sqlalchemy.sql.expression.Executable"],
    connection: "ConnectionType",
    params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
    *,
    output: Optional["dict[str, DataType]"] = None,
    session: Optional["Session"] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    infer_schema_length: Optional[int] = 100,
) -> "DataChain":
    """
    Run a SQL query over the given connection and load its results into a DataChain.

    Args:
        query:
            The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
            `Executable` object.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection.
            Using SQLAlchemy makes it possible to use any DB supported by that
            library. If a DBAPI2 object, only sqlite3 is supported. The user is
            responsible for engine disposal and connection closure for the
            SQLAlchemy connectable; str connections are closed automatically.
        params: Parameters to pass to execute method.
        output: A dictionary mapping column names to types, used to override the
            schema inferred from the query results.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: If True, creates an in-memory session. Defaults to False.
        infer_schema_length:
            The maximum number of rows to scan for inferring schema.
            If set to `None`, the full data may be scanned.
            The rows used for schema inference are stored in memory,
            so large values can lead to high memory usage.
            Only applies if the `output` parameter is not set for the given column.

    Examples:
        Reading from a SQL query against a user-supplied connection:
        ```python
        query = "SELECT key, value FROM tbl"
        chain = dc.read_database(query, connection, output={"value": float})
        ```

        Load data from a SQLAlchemy driver/engine:
        ```python
        from sqlalchemy import create_engine
        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
        chain = dc.read_database("select * from tbl", engine)
        ```

        Load data from a parameterized SQLAlchemy query:
        ```python
        query = "SELECT key, value FROM tbl WHERE value > :value"
        dc.read_database(query, engine, params={"value": 50})
        ```

    Notes:
        - This function works with a variety of databases — including,
          but not limited to, SQLite, DuckDB, PostgreSQL, and Snowflake,
          provided the appropriate driver is installed.
        - This call is blocking, and will execute the query and return once the
          results are saved.
    """
    from datachain.lib.dc.records import read_records

    overrides = output or {}
    statement = sqlalchemy.text(query) if isinstance(query, str) else query

    with _connect(connection) as conn:
        # use server-side cursors
        with conn.execute(
            statement, params, execution_options={"stream_results": True}
        ) as result:
            # Only columns without a user-supplied type need inference;
            # the comprehension preserves the order of the result columns.
            pending = [key for key in result.keys() if key not in overrides]
            sampled, inferred = _infer_schema(result, pending, infer_schema_length)

            # Rows already pulled for inference go first, then the remainder
            # of the cursor. Records are consumed before the cursor is closed
            # because the insert happens inside this `with` block.
            records = (row._asdict() for row in itertools.chain(sampled, result))
            return read_records(
                records,
                session=session,
                settings=settings,
                in_memory=in_memory,
                schema=inferred | overrides,
            )

ConnectionType `module-attribute`

ConnectionType = Union[
    str,
    URL,
    Connectable,
    Engine,
    Connection,
    Session,
    Connection,
]

DataChain

DataChain(
    query: DatasetQuery,
    settings: Settings,
    signal_schema: SignalSchema,
    setup: Optional[dict] = None,
    _sys: bool = False,
)

DataChain - a data structure for batch data processing and evaluation.

It represents a sequence of data manipulation steps such as reading data from storages, running AI or LLM models or calling external services API to validate or enrich data.

Data in DataChain is presented as Python classes with an arbitrary set of fields, including nested classes. The data classes have to inherit from the DataModel class. The supported field types include the majority of the types supported by the underlying library, Pydantic.

dataset `property`

dataset: Optional[DatasetRecord]

Underlying dataset, if there is one.

delta `property`

delta: bool

Returns True if this chain is run in "delta" update mode.

empty `property`

empty: bool

Returns True if chain has zero number of rows

name `property`

name: Optional[str]

Name of the underlying dataset, if there is one.

schema `property`

schema: dict[str, DataType]

Get schema of the chain.

session `property`

session: Session

Session of the chain.

version `property`

version: Optional[str]

Version of the underlying dataset, if there is one.

or

__or__(other: Self) -> Self

Return self.union(other).

Source code in datachain/lib/dc/datachain.py

def __or__(self, other: "Self") -> "Self":
    """Support `chain_a | chain_b` as shorthand for `chain_a.union(chain_b)`."""
    combined = self.union(other)
    return combined

repr

__repr__() -> str

Return a string representation of the chain.

Source code in datachain/lib/dc/datachain.py

def __repr__(self) -> str:
    """Return the printed schema of the chain, or an "Empty ..." marker.

    A chain whose effective signal schema has no values is rendered as
    ``Empty <ClassName>``; otherwise the output of `print_schema` is
    captured into a string and returned.
    """
    if self._effective_signals_schema.values:
        import io

        # print_schema writes to a file-like object, so capture it in memory.
        buffer = io.StringIO()
        self.print_schema(file=buffer)
        return buffer.getvalue()

    return f"Empty {self.__class__.__name__}"

agg

agg(
    func: Optional[Callable] = None,
    partition_by: Optional[PartitionByType] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    **signal_map
) -> Self

Aggregate rows using partition_by statement and apply a function to the groups of aggregated rows. The function needs to return new objects for each group of the new rows. It returns a chain itself with new signals.

Input-output relationship: N:M

This method bears similarity to gen() and map(), employing a comparable set of parameters, yet differs in two crucial aspects: 1. The partition_by parameter: This specifies the column name or a list of column names that determine the grouping criteria for aggregation. 2. Group-based UDF function input: Instead of individual rows, the function receives a list of all rows within each group defined by partition_by.

Examples:

chain = chain.agg(
    total=lambda category, amount: [sum(amount)],
    output=float,
    partition_by="category",
)
chain.save("new_dataset")

An alternative syntax, when you need to specify a more complex function:

# It automatically resolves which columns to pass to the function
# by looking at the function signature.
def agg_sum(
    file: list[File], amount: list[float]
) -> Iterator[tuple[File, float]]:
    yield file[0], sum(amount)

chain = chain.agg(
    agg_sum,
    output={"file": File, "total": float},
    # Alternative syntax is to use `C` (short for Column) to specify
    # a column name or a nested column, e.g. C("file.path").
    partition_by=C("category"),
)
chain.save("new_dataset")

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def agg(
    self,
    func: Optional[Callable] = None,
    partition_by: Optional[PartitionByType] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    **signal_map,
) -> "Self":
    """Group rows with `partition_by` and apply a function to each group.

    The function must return new objects for each group of rows. The result
    is a chain carrying the new signals.

    Input-output relationship: N:M

    The parameters mirror those of `gen()` and `map()`, with two differences:

    1. The `partition_by` parameter: the column name, or list of column
       names, that defines how rows are grouped for aggregation.
    2. Group-based UDF function input: instead of individual rows, the
       function receives a list of all rows within each group defined by
       `partition_by`.

    Examples:
        ```py
        chain = chain.agg(
            total=lambda category, amount: [sum(amount)],
            output=float,
            partition_by="category",
        )
        chain.save("new_dataset")
        ```

        An alternative syntax, when you need to specify a more complex function:

        ```py
        # It automatically resolves which columns to pass to the function
        # by looking at the function signature.
        def agg_sum(
            file: list[File], amount: list[float]
        ) -> Iterator[tuple[File, float]]:
            yield file[0], sum(amount)

        chain = chain.agg(
            agg_sum,
            output={"file": File, "total": float},
            # Alternative syntax is to use `C` (short for Column) to specify
            # a column name or a nested column, e.g. C("file.path").
            partition_by=C("category"),
        )
        chain.save("new_dataset")
        ```
    """
    aggregator = self._udf_to_obj(Aggregator, func, params, output, signal_map)
    grouped_query = self._query.generate(
        aggregator.to_udf_wrapper(),
        partition_by=partition_by,
        **self._settings.to_dict(),
    )
    # The aggregator's output schema replaces the chain's schema.
    return self._evolve(query=grouped_query, signal_schema=aggregator.output)

apply

apply(func, *args, **kwargs)

Apply any function to the chain.

Useful for reusing in a chain of operations.

Example

import datachain as dc
def parse_stem(chain):
    return chain.map(
        lambda file: file.get_file_stem(),
        output={"stem": str}
    )

chain = (
    dc.read_storage("s3://my-bucket")
    .apply(parse_stem)
    .filter(C("stem").glob("*cat*"))
)

Source code in datachain/lib/dc/datachain.py

def apply(self, func, *args, **kwargs):
    """Apply any function to the chain.

    Useful for reusing in a chain of operations.

    Parameters:
        func : Callable invoked as `func(chain, *args, **kwargs)`.
        *args : Extra positional arguments passed to `func` after the chain.
        **kwargs : Extra keyword arguments passed to `func`.

    Returns:
        Whatever `func` returns.

    Example:
        ```py
        import datachain as dc
        def parse_stem(chain):
            return chain.map(
                lambda file: file.get_file_stem(),
                output={"stem": str}
            )

        chain = (
            dc.read_storage("s3://my-bucket")
            .apply(parse_stem)
            .filter(C("stem").glob("*cat*"))
        )
        ```
    """
    return func(self, *args, **kwargs)

avg

avg(fr: DataType)

Compute the average of a column.

Source code in datachain/lib/dc/datachain.py

def avg(self, fr: DataType):  # type: ignore[override]
    """Compute the average of a column.

    Parameters:
        fr : Column reference forwarded unchanged to `_extend_to_data_model`
                (NOTE(review): exact accepted forms are defined by that helper -
                confirm there).

    Returns:
        The result of `self._extend_to_data_model("avg", fr)`.
    """
    return self._extend_to_data_model("avg", fr)

batch_map

batch_map(
    func: Optional[Callable] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    batch: int = 1000,
    **signal_map
) -> Self

This is a batch version of map().

Input-output relationship: N:N

It accepts the same parameters plus an additional parameter:

batch : Size of each batch passed to `func`. Defaults to 1000.

Example

chain = chain.batch_map(
    sqrt=lambda size: np.sqrt(size),
    output=float
)
chain.save("new_dataset")

Source code in datachain/lib/dc/datachain.py

def batch_map(
    self,
    func: Optional[Callable] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    batch: int = 1000,
    **signal_map,
) -> "Self":
    """This is a batch version of `map()`.

    Input-output relationship: N:N

    It accepts the same parameters plus an
    additional parameter:

        batch : Size of each batch passed to `func`. Defaults to 1000.

    The new signals produced by the UDF are merged into the current
    signals schema of the chain.

    Example:
        ```py
        chain = chain.batch_map(
            sqrt=lambda size: np.sqrt(size),
            output=float
        )
        chain.save("new_dataset")
        ```
    """
    udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)

    # Build the wrapper with the requested batch size, then attach it to the
    # query together with the chain settings.
    batched_wrapper = udf_obj.to_udf_wrapper(batch)
    settings = self._settings.to_dict()
    extended_query = self._query.add_signals(batched_wrapper, **settings)

    # Existing signals are kept; the UDF output is added on top of them.
    merged_schema = self.signals_schema | udf_obj.output
    return self._evolve(query=extended_query, signal_schema=merged_schema)

c

c(column: Union[str, Column]) -> Column

Returns Column instance attached to the current chain.

Source code in datachain/lib/dc/datachain.py

def c(self, column: Union[str, Column]) -> Column:
    """Returns Column instance attached to the current chain.

    Accepts either a column name or a `Column` (whose `name` is used); the
    column is looked up through `self.column()` and its `table` is set to the
    table of the chain's query.
    """
    column_name = column if isinstance(column, str) else column.name
    attached = self.column(column_name)
    attached.table = self._query.table
    return attached

chunk

chunk(index: int, total: int) -> Self

Split a chain into smaller chunks for e.g. parallelization.

Example

import datachain as dc

chain = dc.read_storage(...)
chunk_1 = chain.chunk(0, 2)
chunk_2 = chain.chunk(1, 2)

Note

Bear in mind that index is 0-indexed but total isn't. Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.

Source code in datachain/lib/dc/datachain.py

def chunk(self, index: int, total: int) -> "Self":
    """Split a chain into smaller chunks for e.g. parallelization.

    Parameters:
        index (int): Zero-based position of the chunk to return.
        total (int): Total number of chunks the chain is split into.

    Returns:
        A new chain whose query is restricted to the requested chunk.

    Example:
        ```py
        import datachain as dc

        chain = dc.read_storage(...)
        chunk_1 = chain.chunk(0, 2)
        chunk_2 = chain.chunk(1, 2)
        ```

    Note:
        Bear in mind that `index` is 0-indexed but `total` isn't.
        Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
    """
    chunked_query = self._query.chunk(index, total)
    return self._evolve(query=chunked_query)

clone

clone() -> Self

Make a copy of the chain in a new table.

Source code in datachain/lib/dc/datachain.py

def clone(self) -> "Self":
    """Make a copy of the chain in a new table.

    The underlying query is cloned with `new_table=True` and wrapped in a
    new chain.
    """
    cloned_query = self._query.clone(new_table=True)
    return self._evolve(query=cloned_query)

collect

collect() -> Iterator[tuple[DataValue, ...]]

collect(col: str) -> Iterator[DataValue]

collect(*cols: str) -> Iterator[tuple[DataValue, ...]]

collect(
    *cols: str,
) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]

Yields rows of values, optionally limited to the specified columns.

Parameters:

*cols (str, default: () ) –

Limit to the specified columns. By default, all columns are selected.

Yields:

DataType –

Yields a single item if a column is selected.
tuple[DataType, ...] –

Yields a tuple of items if multiple columns are selected.

Example

Iterating over all rows:

for row in dc.collect():
    print(row)

Iterating over all rows with selected columns:

for name, size in dc.collect("file.path", "file.size"):
    print(name, size)

Iterating over a single column:

for file in dc.collect("file.path"):
    print(file)

Source code in datachain/lib/dc/datachain.py

def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
    """Yields rows of values, optionally limited to the specified columns.

    Args:
        *cols: Limit to the specified columns. By default, all columns are selected.

    Yields:
        (DataValue): Yields a single item if exactly one column is selected.
        (tuple[DataValue, ...]): Yields a tuple of items if no columns or
            multiple columns are selected.

    Example:
        Iterating over all rows:
        ```py
        for row in dc.collect():
            print(row)
        ```

        Iterating over all rows with selected columns:
        ```py
        for name, size in dc.collect("file.path", "file.size"):
            print(name, size)
        ```

        Iterating over a single column:
        ```py
        for file in dc.collect("file.path"):
            print(file)
        ```
    """
    # Narrow the chain only when specific columns were requested.
    chain = self.select(*cols) if cols else self
    schema = chain._effective_signals_schema
    db_columns = schema.db_signals()

    # A single requested column is yielded bare instead of as a 1-tuple.
    unwrap_single = len(cols) == 1

    with self._query.ordered_select(*db_columns).as_iterable() as rows:
        for raw_row in rows:
            features = schema.row_to_features(
                raw_row, catalog=chain.session.catalog, cache=chain._settings.cache
            )
            if unwrap_single:
                yield features[0]
            else:
                yield tuple(features)

collect_flatten

collect_flatten() -> Iterator[tuple[Any, ...]]

collect_flatten(
    *, include_hidden: bool
) -> Iterator[tuple[Any, ...]]

collect_flatten(
    *,
    row_factory: Callable[[list[str], tuple[Any, ...]], _T]
) -> Iterator[_T]

collect_flatten(
    *,
    row_factory: Callable[[list[str], tuple[Any, ...]], _T],
    include_hidden: bool
) -> Iterator[_T]

collect_flatten(
    *, row_factory=None, include_hidden: bool = True
)

Yields flattened rows of values as a tuple.

Parameters:

row_factory –

A callable to convert row to a custom format. It should accept two arguments: a list of column names and a tuple of row values.
include_hidden (bool, default: True ) –

Whether to include hidden signals from the schema.

Source code in datachain/lib/dc/datachain.py

def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
    """Yields flattened rows of values as a tuple.

    Args:
        row_factory : A callable to convert row to a custom format.
                      It should accept two arguments: a list of column names and
                      a tuple of row values. When given, its return value is
                      yielded instead of the raw row.
        include_hidden: Whether to include hidden signals from the schema.
    """
    column_names = self._effective_signals_schema.db_signals(
        include_hidden=include_hidden
    )
    with self._query.ordered_select(*column_names).as_iterable() as raw_rows:
        if row_factory:
            # Convert lazily so rows are still streamed one at a time.
            yield from (row_factory(column_names, values) for values in raw_rows)
        else:
            yield from raw_rows

column

column(name: str) -> Column

Returns Column instance with a type if name is found in current schema, otherwise raises an exception.

Source code in datachain/lib/dc/datachain.py

def column(self, name: str) -> Column:
    """Returns Column instance with a type if name is found in current schema,
    otherwise raises an exception.

    The name may be given as a dotted path or joined with `DEFAULT_DELIMITER`;
    a dot takes precedence when both are present.

    Raises:
        ValueError: If no entry of the flattened schema matches the name.
    """
    # Pick the first separator that occurs in the name, "." first.
    separator = next(
        (sep for sep in (".", DEFAULT_DELIMITER) if sep in name),
        None,
    )
    wanted_path = [name] if separator is None else name.split(separator)

    for entry_path, entry_type, _, _ in self.signals_schema.get_flat_tree():
        if entry_path == wanted_path:
            return Column(name, python_to_sql(entry_type))

    raise ValueError(f"Column with name {name} not found in the schema")

compare

compare(
    other: DataChain,
    on: Union[str, Sequence[str]],
    right_on: Optional[Union[str, Sequence[str]]] = None,
    compare: Optional[Union[str, Sequence[str]]] = None,
    right_compare: Optional[
        Union[str, Sequence[str]]
    ] = None,
    added: bool = True,
    deleted: bool = True,
    modified: bool = True,
    same: bool = False,
    status_col: Optional[str] = None,
) -> DataChain

Comparing two chains by identifying rows that are added, deleted, modified or same. Result is the new chain that has additional column with possible values: A, D, M, U representing added, deleted, modified and same rows respectively. Note that if only one "status" is asked, by setting proper flags, this additional column is not created as it would have only one value for all rows. Beside additional diff column, new chain has schema of the chain on which method was called.

Parameters:

other (DataChain) –

Chain to calculate diff from.
on (Union[str, Sequence[str]]) –

Column or list of columns to match on. If both chains have the same columns then this column is enough for the match. Otherwise, right_on parameter has to specify the columns for the other chain. This value is used to find corresponding row in other dataset. If not found there, row is considered as added (or removed if vice versa), and if found then row can be either modified or same.
right_on (Optional[Union[str, Sequence[str]]], default: None ) –

Optional column or list of columns for the other to match.
compare (Optional[Union[str, Sequence[str]]], default: None ) –

Column or list of columns to compare on. If both chains have the same columns then this column is enough for the compare. Otherwise, right_compare parameter has to specify the columns for the other chain. This value is used to see if row is modified or same. If not set, all columns will be used for comparison
right_compare (Optional[Union[str, Sequence[str]]], default: None ) –

Optional column or list of columns for the other to compare to.
added (bool, default: True ) –

Whether to return added rows in resulting chain.
deleted (bool, default: True ) –

Whether to return deleted rows in resulting chain.
modified (bool, default: True ) –

Whether to return modified rows in resulting chain.
same (bool, default: False ) –

Whether to return unchanged rows in resulting chain.
status_col (str, default: None ) –

Name of the new column that is created in resulting chain representing diff status.

Example

res = persons.compare(
    new_persons,
    on=["id"],
    right_on=["other_id"],
    compare=["name"],
    added=True,
    deleted=True,
    modified=True,
    same=True,
    status_col="diff"
)

Source code in datachain/lib/dc/datachain.py

def compare(
    self,
    other: "DataChain",
    on: Union[str, Sequence[str]],
    right_on: Optional[Union[str, Sequence[str]]] = None,
    compare: Optional[Union[str, Sequence[str]]] = None,
    right_compare: Optional[Union[str, Sequence[str]]] = None,
    added: bool = True,
    deleted: bool = True,
    modified: bool = True,
    same: bool = False,
    status_col: Optional[str] = None,
) -> "DataChain":
    """Compare two chains by identifying rows that are added, deleted, modified
    or same. The result is a new chain that has an additional column with possible
    values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
    rows respectively. Note that if only one "status" is asked, by setting proper
    flags, this additional column is not created as it would have only one value
    for all rows. Besides the additional diff column, the new chain has the schema
    of the chain on which the method was called.

    Parameters:
        other: Chain to calculate diff from.
        on: Column or list of columns to match on. If both chains have the
            same columns then this column is enough for the match. Otherwise,
            `right_on` parameter has to specify the columns for the other chain.
            This value is used to find corresponding row in other dataset. If not
            found there, row is considered as added (or removed if vice versa), and
            if found then row can be either modified or same.
        right_on: Optional column or list of columns
            for the `other` to match.
        compare: Column or list of columns to compare on. If both chains have
            the same columns then this column is enough for the compare. Otherwise,
            `right_compare` parameter has to specify the columns for the other
            chain. This value is used to see if row is modified or same. If
            not set, all columns will be used for comparison.
        right_compare: Optional column or list of columns
            for the `other` to compare to.
        added (bool): Whether to return added rows in resulting chain.
        deleted (bool): Whether to return deleted rows in resulting chain.
        modified (bool): Whether to return modified rows in resulting chain.
        same (bool): Whether to return unchanged rows in resulting chain.
        status_col (str): Name of the new column that is created in resulting chain
            representing diff status.

    Returns:
        DataChain: The chain produced by `datachain.diff._compare`, to which all
            arguments are forwarded unchanged.

    Example:
        ```py
        res = persons.compare(
            new_persons,
            on=["id"],
            right_on=["other_id"],
            compare=["name"],
            added=True,
            deleted=True,
            modified=True,
            same=True,
            status_col="diff"
        )
        ```
    """
    # Imported at call time rather than at module import time.
    from datachain.diff import _compare

    return _compare(
        self,
        other,
        on,
        right_on=right_on,
        compare=compare,
        right_compare=right_compare,
        added=added,
        deleted=deleted,
        modified=modified,
        same=same,
        status_col=status_col,
    )

count

count() -> int

Return the number of rows in the chain.

Source code in datachain/lib/dc/datachain.py

def count(self) -> int:
    """Return the number of rows in the chain.

    Returns:
        int: The value reported by `self._query.count()`.
    """
    return self._query.count()

diff

diff(
    other: DataChain,
    on: str = "file",
    right_on: Optional[str] = None,
    added: bool = True,
    modified: bool = True,
    deleted: bool = False,
    same: bool = False,
    status_col: Optional[str] = None,
) -> DataChain

Similar to .compare(), which is a more generic method to calculate the difference between two chains. Unlike .compare(), this method works only on those chains that have a File object, or its derivatives, in them. File source and path are used for matching, and file version and etag for comparing, while in .compare() the user needs to provide arbitrary columns for matching and comparing.

Parameters:

other (DataChain) –

Chain to calculate diff from.
on (str, default: 'file' ) –

File signal to match on. If both chains have the same file signal then this column is enough for the match. Otherwise, right_on parameter has to specify the file signal for the other chain. This value is used to find corresponding row in other dataset. If not found there, row is considered as added (or removed if vice versa), and if found then row can be either modified or same.
right_on (Optional[str], default: None ) –

Optional file signal for the other to match.
added (bool, default: True ) –

Whether to return added rows in resulting chain.
deleted (bool, default: False ) –

Whether to return deleted rows in resulting chain.
modified (bool, default: True ) –

Whether to return modified rows in resulting chain.
same (bool, default: False ) –

Whether to return unchanged rows in resulting chain.
status_col (str, default: None ) –

Optional name of the new column that is created in resulting chain representing diff status.

Example

diff = images.diff(
    new_images,
    on="file",
    right_on="other_file",
    added=True,
    deleted=True,
    modified=True,
    same=True,
    status_col="diff"
)

Source code in datachain/lib/dc/datachain.py

def diff(
    self,
    other: "DataChain",
    on: str = "file",
    right_on: Optional[str] = None,
    added: bool = True,
    modified: bool = True,
    deleted: bool = False,
    same: bool = False,
    status_col: Optional[str] = None,
) -> "DataChain":
    """Similar to `.compare()`, which is a more generic method to calculate the
    difference between two chains. Unlike `.compare()`, this method works only on
    those chains that have a `File` object, or its derivatives, in them. File
    `source` and `path` are used for matching, and file `version` and `etag` for
    comparing, while in `.compare()` the user needs to provide arbitrary columns
    for matching and comparing.

    Parameters:
        other: Chain to calculate diff from.
        on: File signal to match on. If both chains have the
            same file signal then this column is enough for the match. Otherwise,
            `right_on` parameter has to specify the file signal for the other chain.
            This value is used to find corresponding row in other dataset. If not
            found there, row is considered as added (or removed if vice versa), and
            if found then row can be either modified or same.
        right_on: Optional file signal for the `other` to match. Falls back to
            `on` when not provided.
        added (bool): Whether to return added rows in resulting chain.
        deleted (bool): Whether to return deleted rows in resulting chain.
        modified (bool): Whether to return modified rows in resulting chain.
        same (bool): Whether to return unchanged rows in resulting chain.
        status_col (str): Optional name of the new column that is created in
            resulting chain representing diff status.

    Example:
        ```py
        diff = images.diff(
            new_images,
            on="file",
            right_on="other_file",
            added=True,
            deleted=True,
            modified=True,
            same=True,
            status_col="diff"
        )
        ```
    """
    # File fields identifying a row vs. fields revealing a content change.
    match_fields = ("source", "path")
    change_fields = ("version", "etag")

    other_signal = right_on if right_on else on

    left_match = [f"{on}.{field}" for field in match_fields]
    right_match = [f"{other_signal}.{field}" for field in match_fields]
    left_change = [f"{on}.{field}" for field in change_fields]
    right_change = [f"{other_signal}.{field}" for field in change_fields]

    return self.compare(
        other,
        left_match,
        right_on=right_match,
        compare=left_change,
        right_compare=right_change,
        added=added,
        deleted=deleted,
        modified=modified,
        same=same,
        status_col=status_col,
    )

distinct

distinct(arg: str, *args: str) -> Self

Removes duplicate rows based on uniqueness of some input column(s) i.e if rows are found with the same value of input column(s), only one row is left in the result set.

Example

dc.distinct("file.path")

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def distinct(self, arg: str, *args: str) -> "Self":  # type: ignore[override]
    """Removes duplicate rows based on uniqueness of some input column(s)
    i.e. if rows are found with the same value of input column(s), only one
    row is left in the result set.

    Parameters:
        arg : First column to deduplicate on (at least one is required).
        *args : Additional columns to deduplicate on.

    Example:
        ```py
        dc.distinct("file.path")
        ```
    """
    # Resolve the signal names against the schema to get their DB columns.
    resolved_schema = self.signals_schema.resolve(arg, *args)
    unique_columns = resolved_schema.db_signals()
    deduplicated = self._query.distinct(*unique_columns)
    return self._evolve(query=deduplicated)

exec

exec() -> Self

Execute the chain.

Source code in datachain/lib/dc/datachain.py

def exec(self) -> "Self":
    """Execute the chain.

    Runs `exec()` on the underlying query and returns a new chain wrapping
    its result.
    """
    executed_query = self._query.exec()
    return self._evolve(query=executed_query)

explode

explode(
    col: str,
    model_name: Optional[str] = None,
    column: Optional[str] = None,
    schema_sample_size: int = 1,
) -> DataChain

Explodes a column containing JSON objects (dict or str DataChain type) into individual columns based on the schema of the JSON. Schema is inferred from the first schema_sample_size rows of the column (by default, only the first row).

Parameters:

col (str) –

the name of the column containing JSON to be exploded.
model_name (Optional[str], default: None ) –

optional generated model name. By default generates the name automatically.
column (Optional[str], default: None ) –

optional generated column name. By default generates the name automatically.
schema_sample_size (int, default: 1 ) –

the number of rows to use for inferring the schema of the JSON (in case some fields are optional and it's not enough to analyze a single row).

Returns:

DataChain ( DataChain ) –

A new DataChain instance with the new set of columns.

Source code in datachain/lib/dc/datachain.py

def explode(
    self,
    col: str,
    model_name: Optional[str] = None,
    column: Optional[str] = None,
    schema_sample_size: int = 1,
) -> "DataChain":
    """Explodes a column containing JSON objects (dict or str DataChain type) into
       individual columns based on the schema of the JSON. Schema is inferred from
       the first `schema_sample_size` rows of the column (the first row by default).

    Args:
        col: the name of the column containing JSON to be exploded.
        model_name: optional generated model name.  By default the name is
            `f"{col.title()}ExplodedModel"`.
        column: optional generated column name. By default the name is
            `f"{col}_expl"`.
        schema_sample_size: the number of rows to use for inferring the schema of
            the JSON (in case some fields are optional and it's not enough to
            analyze a single row).

    Returns:
        DataChain: A new DataChain instance with the new set of columns.

    Raises:
        TypeError: If any sampled value is not a dict after JSON decoding.
    """
    import json

    import pyarrow as pa

    from datachain.lib.arrow import schema_to_output

    # Decode every sampled value first; only then validate the decoded types.
    sampled = list(self.limit(schema_sample_size).collect(col))
    decoded = [
        json.loads(value) if isinstance(value, str) else value for value in sampled
    ]
    if not all(isinstance(item, dict) for item in decoded):
        raise TypeError(f"Column {col} should be a string or dict type with JSON")

    # Let pyarrow infer a schema from the samples and turn it into a data model.
    inferred_schema = pa.Table.from_pylist(decoded).schema
    output, original_names = schema_to_output(inferred_schema, None)

    model_name = model_name or f"{col.title()}ExplodedModel"
    model = dict_to_data_model(model_name, output, original_names)

    def json_to_model(json_value: Union[str, dict]):
        if isinstance(json_value, str):
            json_dict = json.loads(json_value)
        else:
            json_dict = json_value
        return model.model_validate(json_dict)

    column = column or f"{col}_expl"

    return self.map(json_to_model, params=col, output={column: model})

filter

filter(*args: Any) -> Self

Filter the chain according to conditions.

Example

Basic usage with built-in operators

dc.filter(C("width") < 200)

Using glob to match patterns

dc.filter(C("file.path").glob("*.jpg"))

Using in to match lists

ids = [1,2,3]
dc.filter(C("experiment_id").in_(ids))

Using datachain.func

from datachain.func import string
dc.filter(string.length(C("file.path")) > 5)

Combining filters with "or"

dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*"))

Combining filters with "and"

dc.filter(
    C("file.path").glob("*.jpg") &
    (string.length(C("file.path")) > 5)
)

Source code in datachain/lib/dc/datachain.py

def filter(self, *args: Any) -> "Self":
    """Filter the chain according to conditions.

    Parameters:
        *args : Conditions forwarded unchanged to the underlying query's
                `filter()`.

    Returns:
        A new chain wrapping the filtered query.

    Example:
        Basic usage with built-in operators
        ```py
        dc.filter(C("width") < 200)
        ```

        Using glob to match patterns
        ```py
        dc.filter(C("file.path").glob("*.jpg"))
        ```

        Using in to match lists
        ```py
        ids = [1,2,3]
        dc.filter(C("experiment_id").in_(ids))
        ```

        Using `datachain.func`
        ```py
        from datachain.func import string
        dc.filter(string.length(C("file.path")) > 5)
        ```

        Combining filters with "or"
        ```py
        dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*"))
        ```

        Combining filters with "and"
        ```py
        dc.filter(
            C("file.path").glob("*.jpg") &
            (string.length(C("file.path")) > 5)
        )
        ```
    """
    filtered_query = self._query.filter(*args)
    return self._evolve(query=filtered_query)

gen

gen(
    func: Optional[Union[Callable, Generator]] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    **signal_map
) -> Self

Apply a function to each row to create new rows (with potentially new signals). The function needs to return new objects for each of the new rows. It returns a chain itself with new signals.

Input-output relationship: 1:N

This method is similar to map() and uses the same list of parameters, but with one key difference: it produces a sequence of rows for each input row (like extracting multiple file records from a single tar file or bounding boxes from a single image file).

Example

chain = chain.gen(
    line=lambda file: [l for l in file.read().split("\n")],
    output=str,
)
chain.save("new_dataset")

Source code in datachain/lib/dc/datachain.py

def gen(
    self,
    func: Optional[Union[Callable, Generator]] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    **signal_map,
) -> "Self":
    r"""Apply a function to each row to create new rows (with potentially new
    signals). The function needs to return new objects for each of the new rows.
    It returns a chain itself with new signals.

    Input-output relationship: 1:N

    This method is similar to `map()` and uses the same list of parameters, but
    with one key difference: it produces a sequence of rows for each input row
    (like extracting multiple file records from a single tar file or bounding
    boxes from a single image file). The resulting chain's schema is the UDF
    output only.

    Example:
        ```py
        chain = chain.gen(
            line=lambda file: [l for l in file.read().split("\n")],
            output=str,
        )
        chain.save("new_dataset")
        ```
    """
    udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)

    # Only override the UDF's prefetch when the chain settings define one.
    prefetch = self._settings.prefetch
    if prefetch is not None:
        udf_obj.prefetch = prefetch

    generated_query = self._query.generate(
        udf_obj.to_udf_wrapper(),
        **self._settings.to_dict(),
    )
    return self._evolve(query=generated_query, signal_schema=udf_obj.output)

group_by

group_by(
    *,
    partition_by: Optional[
        Union[str, Func, Sequence[Union[str, Func]]]
    ] = None,
    **kwargs: Func
) -> Self

Group rows by specified set of signals and return new signals with aggregated values.

The supported functions

count(), sum(), avg(), min(), max(), any_value(), collect(), concat()

Example

chain = chain.group_by(
    cnt=func.count(),
    partition_by=("file_source", "file_ext"),
)

Source code in datachain/lib/dc/datachain.py

@delta_disabled  # type: ignore[arg-type]
def group_by(
    self,
    *,
    partition_by: Optional[Union[str, Func, Sequence[Union[str, Func]]]] = None,
    **kwargs: Func,
) -> "Self":
    """Group rows by specified set of signals and return new signals
    with aggregated values.

    The supported functions:
       count(), sum(), avg(), min(), max(), any_value(), collect(), concat()

    Parameters:
        partition_by : Column name, function, or a sequence of them to group
                by. A single `str` or `Func` is wrapped into a one-element
                list; `None` means no partitioning columns.
        **kwargs : Mapping of new signal name to the `Func` that computes it.
                At least one is required.

    Returns:
        A new chain whose schema consists of the function-based partition
        columns and the aggregated signals, merged with the partial schema of
        the string partition columns.

    Raises:
        DataChainColumnError: If a `partition_by` item is neither `str` nor
            `Function`, or if a keyword value is not a `Func`.
        ValueError: If no aggregation is passed in `kwargs`.

    Example:
        ```py
        chain = chain.group_by(
            cnt=func.count(),
            partition_by=("file_source", "file_ext"),
        )
        ```
    """
    # Normalize partition_by to an iterable of items.
    if partition_by is None:
        partition_by = []
    elif isinstance(partition_by, (str, Func)):
        partition_by = [partition_by]

    partition_by_columns: list[Column] = []
    signal_columns: list[Column] = []
    # New fields of the output schema, in insertion order.
    schema_fields: dict[str, DataType] = {}
    # Existing signals (given by name) carried over from the current schema.
    keep_columns: list[str] = []

    # validate partition_by columns and add them to the schema
    for col in partition_by:
        if isinstance(col, str):
            col_db_name = ColumnMeta.to_db_name(col)
            col_type = self.signals_schema.get_column_type(col_db_name)
            column = Column(col_db_name, python_to_sql(col_type))
            # Keep each named signal only once, preserving first-seen order.
            if col not in keep_columns:
                keep_columns.append(col)
        elif isinstance(col, Function):
            column = col.get_column(self.signals_schema)
            col_db_name = column.name
            col_type = column.type.python_type
            # Function-based partitions become new fields of the schema.
            schema_fields[col_db_name] = col_type
        else:
            raise DataChainColumnError(
                col,
                (
                    f"partition_by column {col} has type {type(col)}"
                    " but expected str or Function"
                ),
            )
        partition_by_columns.append(column)

    # validate signal columns and add them to the schema
    # (this check runs only after partition_by has been validated)
    if not kwargs:
        raise ValueError("At least one column should be provided for group_by")
    for col_name, func in kwargs.items():
        if not isinstance(func, Func):
            raise DataChainColumnError(
                col_name,
                f"Column {col_name} has type {type(func)} but expected Func object",
            )
        column = func.get_column(self.signals_schema, label=col_name)
        signal_columns.append(column)
        schema_fields[col_name] = func.get_result_type(self.signals_schema)

    signal_schema = SignalSchema(schema_fields)
    # Merge in the partial schema of the partition columns given by name.
    if keep_columns:
        signal_schema |= self.signals_schema.to_partial(*keep_columns)

    return self._evolve(
        query=self._query.group_by(signal_columns, partition_by_columns),
        signal_schema=signal_schema,
    )

limit

limit(n: int) -> Self

Return the first n rows of the chain.

If the chain is unordered, which rows are returned is undefined. If the chain has less than n rows, the whole chain is returned.

Parameters:

n (int) –

Number of rows to return.

Source code in datachain/lib/dc/datachain.py

def limit(self, n: int) -> "Self":
    """Return the first `n` rows of the chain.

    If the chain is unordered, which rows are returned is undefined.
    If the chain has less than `n` rows, the whole chain is returned.

    Parameters:
        n (int): Number of rows to return.

    Returns:
        A new chain wrapping the limited query.
    """
    limited_query = self._query.limit(n)
    return self._evolve(query=limited_query)

map

map(
    func: Optional[Callable] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    **signal_map
) -> Self

Apply a function to each row to create new signals. The function should return a new object for each row. It returns a chain itself with new signals.

Input-output relationship: 1:1

Parameters:

func –

Function applied to each row.
params –

List of column names used as input for the function. Default is taken from function signature.
output –

Dictionary defining new signals and their corresponding types. Default type is taken from function signature. Default can be also taken from kwargs - **signal_map (see below). If signal name is defined using signal_map (see below) only a single type value can be used.
**signal_map –

kwargs can be used to define func together with its return signal name in the format map(my_sign=my_func). This helps define signal names and functions in a nicer way.

Example

Using signal_map and single type in output:

chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
chain.save("new_dataset")

Using func and output as a map:

chain = chain.map(
    lambda name: name.split("."), output={"stem": str, "ext": str}
)
chain.save("new_dataset")

Source code in datachain/lib/dc/datachain.py

def map(
    self,
    func: Optional[Callable] = None,
    params: Union[None, str, Sequence[str]] = None,
    output: OutputType = None,
    **signal_map,
) -> "Self":
    """Create new signals by applying a function to every row.

    The function is expected to return one new object per row, and the
    resulting chain carries the new signals next to the existing ones.

    Input-output relationship: 1:1

    Parameters:
        func : Function applied to each row.
        params : List of column names used as input for the function. Default
                is taken from function signature.
        output : Dictionary defining new signals and their corresponding types.
                Default type is taken from function signature. Default can be also
                taken from kwargs - **signal_map (see below).
                If signal name is defined using signal_map (see below) only a single
                type value can be used.
        **signal_map : kwargs can be used to define `func` together with its return
                signal name in format of `map(my_sign=my_func)`. This helps define
                signal names and function in a nicer way.

    Example:
        Using signal_map and single type in output:
        ```py
        chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
        chain.save("new_dataset")
        ```

        Using func and output as a map:
        ```py
        chain = chain.map(
            lambda name: name.split("."), output={"stem": str, "ext": str}
        )
        chain.save("new_dataset")
        ```
    """
    mapper = self._udf_to_obj(Mapper, func, params, output, signal_map)

    # A prefetch value configured on the chain overrides the UDF's own value.
    prefetch = self._settings.prefetch
    if prefetch is not None:
        mapper.prefetch = prefetch

    extended_query = self._query.add_signals(
        mapper.to_udf_wrapper(),
        **self._settings.to_dict(),
    )
    combined_schema = self.signals_schema | mapper.output
    return self._evolve(query=extended_query, signal_schema=combined_schema)

max

max(fr: DataType)

Compute the maximum of a column.

Source code in datachain/lib/dc/datachain.py

def max(self, fr: DataType):  # type: ignore[override]
    """Compute the maximum of a column.

    Delegates to `_extend_to_data_model` with the "max" method name.
    """
    method_name = "max"
    return self._extend_to_data_model(method_name, fr)

merge

merge(
    right_ds: DataChain,
    on: Union[MergeColType, Sequence[MergeColType]],
    right_on: Optional[
        Union[MergeColType, Sequence[MergeColType]]
    ] = None,
    inner=False,
    full=False,
    rname="right_",
) -> Self

Merge two chains based on the specified criteria.

Parameters:

right_ds (DataChain) –

Chain to join with.
on (Union[MergeColType, Sequence[MergeColType]]) –

Predicate ("column.name", C("column.name"), or Func) or list of Predicates to join on. If both chains have the same predicates then this predicate is enough for the join. Otherwise, right_on parameter has to specify the predicates for the other chain.
right_on (Optional[Union[MergeColType, Sequence[MergeColType]]], default: None ) –

Optional predicate or list of Predicates for the right_ds to join.
inner (bool, default: False ) –

Whether to run inner join or outer join.
full (bool, default: False ) –

Whether to run full outer join.
rname (str, default: 'right_' ) –

Name prefix for conflicting signal names.

Examples:

meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
                      right_on=(C.name, C.pq__index))

imgs.merge(captions,
           on=func.path.file_stem(imgs.c("file.path")),
           right_on=func.path.file_stem(captions.c("file.path")))

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def merge(
    self,
    right_ds: "DataChain",
    on: Union[MergeColType, Sequence[MergeColType]],
    right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
    inner=False,
    full=False,
    rname="right_",
) -> "Self":
    """Merge two chains based on the specified criteria.

    Parameters:
        right_ds: Chain to join with.
        on: Predicate ("column.name", C("column.name"), or Func) or list of
            Predicates to join on. If both chains have the same predicates then
            this predicate is enough for the join. Otherwise, `right_on` parameter
            has to specify the predicates for the other chain.
        right_on: Optional predicate or list of Predicates for the `right_ds`
            to join.
        inner (bool): Whether to run inner join or outer join.
        full (bool): Whether to run full outer join.
        rname (str): Name prefix for conflicting signal names.

    Returns:
        A new chain built from the joined query, whose schema is the "sys"
        signal combined with both chains' non-sys signals.

    Raises:
        DatasetMergeError: If `on` is None, if `on` or `right_on` is empty
            after validation, if they differ in length, or if a join column
            cannot be resolved.

    Examples:
        ```py
        meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
                              right_on=(C.name, C.pq__index))
        ```

        ```py
        imgs.merge(captions,
                   on=func.path.file_stem(imgs.c("file.path")),
                   right_on=func.path.file_stem(captions.c("file.path")))
        ```
    """
    if on is None:
        raise DatasetMergeError(["None"], None, "'on' must be specified")

    # Validation rebinds `on`, so capture the caller's type first; otherwise
    # the error message would describe the validator's output instead.
    on_type = type(on)
    on = _validate_merge_on(on, self)
    if not on:
        raise DatasetMergeError(
            on,
            right_on,
            (
                "'on' must be 'str', 'Func' or 'Sequence' object "
                f"but got type '{on_type}'"
            ),
        )

    if right_on is not None:
        # Same reasoning as for `on`: keep the original type for the message.
        right_on_type = type(right_on)
        right_on = _validate_merge_on(right_on, right_ds)
        if not right_on:
            raise DatasetMergeError(
                on,
                right_on,
                "'right_on' must be 'str', 'Func' or 'Sequence' object"
                f" but got type '{right_on_type}'",
            )

        if len(right_on) != len(on):
            raise DatasetMergeError(
                on, right_on, "'on' and 'right_on' must have the same length"
            )

    # When both sides compare equal, join against a clone of the right chain.
    if self == right_ds:
        right_ds = right_ds.clone()

    errors = []

    def _resolve(
        ds: DataChain,
        col: Union[str, Function, sqlalchemy.ColumnElement],
        side: Union[str, None],
    ):
        # Turn a join predicate into a column of `ds`. A ValueError is recorded
        # in `errors` only when `side` is given; the function then returns None.
        try:
            if isinstance(col, Function):
                return ds.c(col.get_column())
            return ds.c(col) if isinstance(col, (str, C)) else col
        except ValueError:
            if side:
                errors.append(f"{_get_merge_error_str(col)} in {side}")

    # Without `right_on`, the left predicates are reused for the right chain
    # and resolution failures on the right side are not reported.
    ops = [
        _resolve(self, left, "left")
        == _resolve(right_ds, right, "right" if right_on else None)
        for left, right in zip(on, right_on or on)
    ]

    if errors:
        raise DatasetMergeError(
            on, right_on, f"Could not resolve {', '.join(errors)}"
        )

    query = self._query.join(
        right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
    )
    query.feature_schema = None
    ds = self._evolve(query=query)

    # Rebuild the schema: a single "sys" signal plus both sides' user signals,
    # merged with `rname` passed along for conflicting names.
    signals_schema = self.signals_schema.clone_without_sys_signals()
    right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
    ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
        right_signals_schema, rname
    )

    return ds

min

min(fr: DataType)

Compute the minimum of a column.

Source code in datachain/lib/dc/datachain.py

def min(self, fr: DataType):  # type: ignore[override]
    """Compute the minimum of a column.

    Delegates to `_extend_to_data_model` with the "min" method name.
    """
    method_name = "min"
    return self._extend_to_data_model(method_name, fr)

mutate

mutate(**kwargs) -> Self

Create new signals based on existing signals.

This method cannot modify existing columns. If you need to modify an existing column, use a different name for the new column and then use select() to choose which columns to keep.

This method is vectorized and more efficient compared to map(), and it does not extract or download any data from the internal database. However, it can only utilize predefined built-in functions and their combinations.

The supported functions

Numerical: +, -, *, /, rand(), avg(), count(), func(), greatest(), least(), max(), min(), sum()
String: length(), split(), replace(), regexp_replace()
Filename: name(), parent(), file_stem(), file_ext()
Array: length(), sip_hash_64(), euclidean_distance(), cosine_distance()
Window: row_number(), rank(), dense_rank(), first()

Example:

 dc.mutate(
    area=Column("image.height") * Column("image.width"),
    extension=file_ext(Column("file.path")),
    dist=cosine_distance(embedding_text, embedding_image)
)

Window function example:

window = func.window(partition_by="file.parent", order_by="file.size")
dc.mutate(
    row_number=func.row_number().over(window),
)

This method can be also used to rename signals. If the Column("name") provided as value for the new signal - the old column will be dropped. Otherwise a new column is created.

Example:

 dc.mutate(
    newkey=Column("oldkey")
)

Source code in datachain/lib/dc/datachain.py

def mutate(self, **kwargs) -> "Self":
    """Create new signals based on existing signals.

    This method cannot modify existing columns. If you need to modify an
    existing column, use a different name for the new column and then use
    `select()` to choose which columns to keep.

    This method is vectorized and more efficient compared to map(), and it does not
    extract or download any data from the internal database. However, it can only
    utilize predefined built-in functions and their combinations.

    The supported functions:
       Numerical:   +, -, *, /, rand(), avg(), count(), func(),
                    greatest(), least(), max(), min(), sum()
       String:      length(), split(), replace(), regexp_replace()
       Filename:    name(), parent(), file_stem(), file_ext()
       Array:       length(), sip_hash_64(), euclidean_distance(),
                    cosine_distance()
       Window:      row_number(), rank(), dense_rank(), first()

    Example:
    ```py
    dc.mutate(
        area=Column("image.height") * Column("image.width"),
        extension=file_ext(Column("file.path")),
        dist=cosine_distance(embedding_text, embedding_image)
    )
    ```

    Window function example:
    ```py
    window = func.window(partition_by="file.parent", order_by="file.size")
    dc.mutate(
        row_number=func.row_number().over(window),
    )
    ```

    This method can be also used to rename signals. If the Column("name") provided
    as value for the new signal - the old column will be dropped. Otherwise a new
    column is created.

    Example:
    ```py
    dc.mutate(
        newkey=Column("oldkey")
    )
    ```
    """
    from sqlalchemy.sql.sqltypes import NullType

    primitives = (bool, str, int, float)
    self_typed = (*primitives, Column, Func)

    # Validate everything up front: any other expression must carry a type.
    for col_name, expr in kwargs.items():
        if isinstance(expr, self_typed):
            continue
        if isinstance(expr.type, NullType):
            raise DataChainColumnError(
                col_name, f"Cannot infer type with expression {expr}"
            )

    schema = self.signals_schema
    mutated = {}
    for name, value in kwargs.items():
        if isinstance(value, Column):
            # Rename: map each db signal found for the old name to the new name.
            old_name = value.name
            for signal in schema.db_signals(name=old_name, as_columns=True):
                new_key = signal.name.replace(old_name, name, 1)  # type: ignore[union-attr]
                mutated[new_key] = signal
            continue

        if isinstance(value, Func):
            # New signal computed by a function bound to the current schema.
            mutated[name] = value.get_column(schema)
            continue

        if isinstance(value, primitives):
            # Python constants (str, int, float, bool) become typed SQL literals.
            constant = literal(value)
            constant.type = python_to_sql(type(value))()
            mutated[name] = constant  # type: ignore[assignment]
            continue

        # Any other expression is passed through as the new signal.
        mutated[name] = value

    mutated_query = self._query.mutate(**mutated)
    return self._evolve(query=mutated_query, signal_schema=schema.mutate(kwargs))

offset

offset(offset: int) -> Self

Return the results starting with the offset row.

If the chain is unordered, which rows are skipped is undefined. If the chain has fewer than offset rows, the result is an empty chain.

Parameters:

offset (int) –

Number of rows to skip.

Source code in datachain/lib/dc/datachain.py

def offset(self, offset: int) -> "Self":
    """Return the chain with its first `offset` rows skipped.

    Which rows are skipped is undefined when the chain is unordered.
    A chain holding fewer than `offset` rows yields an empty chain.

    Parameters:
        offset (int): Number of rows to skip.
    """
    shifted_query = self._query.offset(offset)
    return self._evolve(query=shifted_query)

order_by

order_by(*args, descending: bool = False) -> Self

Orders by specified set of columns.

Parameters:

descending (bool, default: False ) –

Whether to sort in descending order or not.

Example

dc.order_by("similarity_score", descending=True).limit(10)

Note

Order is not guaranteed when steps are added after an order_by statement. I.e. when using read_dataset an order_by statement should be used if the order of the records in the chain is important. Using order_by directly before limit, collect and collect_flatten will give expected results. See https://github.com/iterative/datachain/issues/477 for further details.

Source code in datachain/lib/dc/datachain.py

@resolve_columns
def order_by(self, *args, descending: bool = False) -> "Self":
    """Order the chain by the given columns.

    Parameters:
        descending (bool): Whether to sort in descending order or not.
            When set, every given column is wrapped in `sqlalchemy.desc`.

    Example:
        ```py
        dc.order_by("similarity_score", descending=True).limit(10)
        ```

    Note:
        Order is not guaranteed when steps are added after an `order_by` statement.
        I.e. when using `read_dataset` an `order_by` statement should be used if
        the order of the records in the chain is important.
        Using `order_by` directly before `limit`, `collect` and `collect_flatten`
        will give expected results.
        See https://github.com/iterative/datachain/issues/477 for further details.
    """
    sort_keys = tuple(sqlalchemy.desc(key) for key in args) if descending else args
    ordered_query = self._query.order_by(*sort_keys)
    return self._evolve(query=ordered_query)

parse_tabular

parse_tabular(
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows: Optional[int] = None,
    **kwargs
) -> Self

Generate chain from list of tabular files.

Parameters:

output –

Dictionary or feature class defining column names and their corresponding types. List of column names is also accepted, in which case types will be inferred.
column –

Generated column name.
model_name –

Generated model name.
source –

Whether to include info about the source file.
nrows –

Optional row limit.
kwargs –

Parameters to pass to pyarrow.dataset.dataset.

Example

Reading a json lines file:

import datachain as dc
chain = dc.read_storage("s3://mybucket/file.jsonl")
chain = chain.parse_tabular(format="json")

Reading a filtered list of files as a dataset:

import datachain as dc

chain = dc.read_storage("s3://mybucket")
chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
chain = chain.parse_tabular(format="json")

Source code in datachain/lib/dc/datachain.py

def parse_tabular(
    self,
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows: Optional[int] = None,
    **kwargs,
) -> "Self":
    """Generate chain from list of tabular files.

    Parameters:
        output : Dictionary or feature class defining column names and their
            corresponding types. List of column names is also accepted, in which
            case types will be inferred.
        column : Generated column name.
        model_name : Generated model name.
        source : Whether to include info about the source file.
        nrows : Optional row limit. Only supported for csv and json formats.
        kwargs : Parameters to pass to pyarrow.dataset.dataset. A
            `parse_options` entry is removed from them and handed to the
            schema inference and the generator as a separate argument.

    Raises:
        DatasetPrepareError: If `nrows` is given for a format other than csv
            or json, if the chain has no "file" signal or no rows, or if
            schema inference raises a `ValueError`.

    Example:
        Reading a json lines file:
        ```py
        import datachain as dc
        chain = dc.read_storage("s3://mybucket/file.jsonl")
        chain = chain.parse_tabular(format="json")
        ```

        Reading a filtered list of files as a dataset:
        ```py
        import datachain as dc

        chain = dc.read_storage("s3://mybucket")
        chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
        chain = chain.parse_tabular(format="json")
        ```
    """
    from pyarrow.dataset import CsvFileFormat, JsonFileFormat

    from datachain.lib.arrow import (
        ArrowGenerator,
        fix_pyarrow_format,
        infer_schema,
        schema_to_output,
    )

    # parse_options travels separately from the remaining pyarrow kwargs.
    parse_options = kwargs.pop("parse_options", None)
    # kwargs["format"] is replaced by the result of fix_pyarrow_format, while
    # the local `format` keeps the caller's value for the nrows check below.
    if format := kwargs.get("format"):
        kwargs["format"] = fix_pyarrow_format(format, parse_options)

    # A row limit is rejected unless the caller's format is csv/json, given
    # either as a string or as the matching pyarrow format object.
    if (
        nrows
        and format not in ["csv", "json"]
        and not isinstance(format, (CsvFileFormat, JsonFileFormat))
    ):
        raise DatasetPrepareError(
            self.name,
            "error in `parse_tabular` - "
            "`nrows` only supported for csv and json formats.",
        )

    # Needs a "file" signal and at least one row (this calls self.count()).
    if "file" not in self.schema or not self.count():
        raise DatasetPrepareError(self.name, "no files to parse.")

    schema = None
    # A sequence `output` is treated as a list of column names.
    col_names = output if isinstance(output, Sequence) else None
    # Infer the schema when only column names, or no output at all, were given.
    if col_names or not output:
        try:
            schema = infer_schema(self, **kwargs, parse_options=parse_options)
            output, _ = schema_to_output(schema, col_names)
        except ValueError as e:
            raise DatasetPrepareError(self.name, e) from e

    # A dict output is turned into a data model, named after `model_name`
    # or, failing that, `column`.
    if isinstance(output, dict):
        model_name = model_name or column or ""
        model = dict_to_data_model(model_name, output)
        output = model
    else:
        model = output  # type: ignore[assignment]

    # With `column` set, the whole model becomes one signal under that name;
    # otherwise a model class is expanded into one signal per model field.
    if column:
        output = {column: model}  # type: ignore[dict-item]
    elif isinstance(output, type(BaseModel)):
        output = {
            name: info.annotation  # type: ignore[misc]
            for name, info in output.model_fields.items()
        }

    # Put a "source" signal of type ArrowRow in front of the other signals.
    if source:
        output = {"source": ArrowRow} | output  # type: ignore[assignment,operator]

    # disable prefetch if nrows is set
    settings = {"prefetch": 0} if nrows else {}
    return self.settings(**settings).gen(  # type: ignore[arg-type]
        ArrowGenerator(
            schema,
            model,
            source,
            nrows,
            parse_options=parse_options,
            **kwargs,
        ),
        output=output,
    )

persist

persist() -> Self

Saves temporary chain that will be removed after the process ends. Temporary datasets are useful for optimization, for example when we have multiple chains starting with identical sub-chain. We can then persist that common chain and use it to calculate other chains, to avoid re-calculation every time. It returns the chain itself.

Source code in datachain/lib/dc/datachain.py

def persist(self) -> "Self":
    """Save the chain as a temporary dataset removed when the process ends.

    Temporary datasets are useful for optimization, for example when several
    chains start with an identical sub-chain: that common part can be
    persisted once and reused, instead of being re-calculated every time.
    The query is saved with the serialized schema without sys signals.
    It returns the chain itself.
    """
    user_schema = self.signals_schema.clone_without_sys_signals()
    serialized = user_schema.serialize()
    saved_query = self._query.save(feature_schema=serialized)
    return self._evolve(query=saved_query)

print_schema

print_schema(file: Optional[IO] = None) -> None

Print schema of the chain.

Source code in datachain/lib/dc/datachain.py

def print_schema(self, file: Optional[IO] = None) -> None:
    """Print the chain's schema as a tree.

    Parameters:
        file : Optional stream, passed on to the schema's `print_tree`.
    """
    effective_schema = self._effective_signals_schema
    effective_schema.print_tree(file=file)

reset_settings

reset_settings(settings: Optional[Settings] = None) -> Self

Reset all settings to default values.

Source code in datachain/lib/dc/datachain.py

def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
    """Reset all settings to default values.

    The chain is modified in place and returned. If a truthy `settings`
    object is given it is used instead of a fresh `Settings()`.
    """
    if settings:
        self._settings = settings
    else:
        self._settings = Settings()
    return self

sample

sample(n) -> Self

Return a random sample from the chain.

Parameters:

n (int) –

Number of samples to draw.

NOTE: Samples are not deterministic, and streamed/paginated queries or multiple workers will draw samples with replacement.

Source code in datachain/lib/dc/datachain.py

def sample(self, n) -> "Self":
    """Draw a random sample from the chain.

    Parameters:
        n (int): Number of samples to draw.

    NOTE: Samples are not deterministic, and streamed/paginated queries or
    multiple workers will draw samples with replacement.
    """
    sampled_query = self._query.sample(n)
    return self._evolve(query=sampled_query)

save

save(
    name: str,
    version: Optional[str] = None,
    description: Optional[str] = None,
    attrs: Optional[list[str]] = None,
    update_version: Optional[str] = "patch",
    **kwargs
) -> DataChain

Save to a Dataset. It returns the chain itself.

Parameters:

name –

dataset name.
version –

version of a dataset. If version is not specified and dataset already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
description –

description of a dataset.
attrs –

attributes of a dataset. They can be without value, e.g "NLP", or with a value, e.g "location=US".
update_version (Optional[str], default: 'patch' ) –

which part of the dataset version to automatically increase. Available values: major, minor or patch. Default is patch.

Source code in datachain/lib/dc/datachain.py

def save(  # type: ignore[override]
    self,
    name: str,
    version: Optional[str] = None,
    description: Optional[str] = None,
    attrs: Optional[list[str]] = None,
    update_version: Optional[str] = "patch",
    **kwargs,
) -> "DataChain":
    """Save to a Dataset. It returns the chain itself.

    Parameters:
        name : dataset name.
        version : version of a dataset. If version is not specified and dataset
            already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
        description : description of a dataset.
        attrs : attributes of a dataset. They can be without value, e.g "NLP",
            or with a value, e.g "location=US".
        update_version: which part of the dataset version to automatically increase.
            Available values: `major`, `minor` or `patch`. Default is `patch`.

    Raises:
        ValueError: If `update_version` is neither None nor one of the
            available values.
    """
    if version is not None:
        semver.validate(version)

    allowed_bumps = ("patch", "major", "minor")
    if not (update_version is None or update_version in allowed_bumps):
        raise ValueError(
            "update_version can have one of the following values: "
            "major, minor or patch"
        )

    schema = self.signals_schema.clone_without_sys_signals().serialize()

    if self.delta and name:
        delta_ds, dependencies, has_changes = delta_update(
            self,
            name,
            on=self._delta_on,
            right_on=self._delta_result_on,
            compare=self._delta_compare,
        )

        if delta_ds:
            # Save the delta chain's query, recording its dependencies.
            delta_query = delta_ds._query.save(
                name=name,
                version=version,
                feature_schema=schema,
                dependencies=dependencies,
                **kwargs,
            )
            return self._evolve(query=delta_query)

        if not has_changes:
            # The sources are unchanged, so a new version would be identical
            # to the previous one. Rather than duplicating the dataset, read
            # back the dataset by name and return that.
            from .datasets import read_dataset

            return read_dataset(name, **kwargs)

    saved_query = self._query.save(
        name=name,
        version=version,
        description=description,
        attrs=attrs,
        feature_schema=schema,
        update_version=update_version,
        **kwargs,
    )
    return self._evolve(query=saved_query)

select

select(*args: str, _sys: bool = True) -> Self

Select only a specified set of signals.

Source code in datachain/lib/dc/datachain.py

def select(self, *args: str, _sys: bool = True) -> "Self":
    """Select only a specified set of signals.

    The "sys" signal is kept in front of the selection when both the chain's
    own `_sys` flag and the `_sys` argument are truthy.
    """
    selected = self.signals_schema.resolve(*args)
    keep_sys = self._sys and _sys
    if keep_sys:
        selected = SignalSchema({"sys": Sys}) | selected

    db_columns = selected.db_signals()
    narrowed_query = self._query.select(*db_columns)
    return self._evolve(query=narrowed_query, signal_schema=selected)

select_except

select_except(*args: str) -> Self

Select all the signals except the specified signals.

Source code in datachain/lib/dc/datachain.py

def select_except(self, *args: str) -> "Self":
    """Select all the signals except the specified signals."""
    remaining = self.signals_schema.select_except_signals(*args)
    db_columns = remaining.db_signals()
    narrowed_query = self._query.select(*db_columns)
    return self._evolve(query=narrowed_query, signal_schema=remaining)

settings

settings(
    cache=None,
    parallel=None,
    workers=None,
    min_task_size=None,
    prefetch: Optional[int] = None,
    sys: Optional[bool] = None,
) -> Self

Change settings for chain.

This function changes specified settings without changing not specified ones. It returns chain, so, it can be chained later with next operation.

Parameters:

cache –

data caching (default=False)
parallel –

number of threads for processors. True is a special value to enable all available CPUs (default=1)
workers –

number of distributed workers. Only for Studio mode. (default=1)
min_task_size –

minimum number of tasks (default=1)
prefetch (Optional[int], default: None ) –

number of workers to use for downloading files in advance. This is enabled by default and uses 2 workers. To disable prefetching, set it to 0.

Example

chain = (
    chain
    .settings(cache=True, parallel=8)
    .map(laion=process_webdataset(spec=WDSLaion), params="file")
)

Source code in datachain/lib/dc/datachain.py

def settings(
    self,
    cache=None,
    parallel=None,
    workers=None,
    min_task_size=None,
    prefetch: Optional[int] = None,
    sys: Optional[bool] = None,
) -> "Self":
    """Change settings for chain.

    Only the settings that are specified are changed; the others keep their
    current values. The chain is returned, so the next operation can be
    chained onto it.

    Parameters:
        cache : data caching (default=False)
        parallel : number of threads for processors. True is a special value to
            enable all available CPUs (default=1)
        workers : number of distributed workers. Only for Studio mode. (default=1)
        min_task_size : minimum number of tasks (default=1)
        prefetch: number of workers to use for downloading files in advance.
                  This is enabled by default and uses 2 workers.
                  To disable prefetching, set it to 0.
        sys : value for the resulting chain's `_sys` flag. When None, the
            chain's current `_sys` value is kept.

    Example:
        ```py
        chain = (
            chain
            .settings(cache=True, parallel=8)
            .map(laion=process_webdataset(spec=WDSLaion), params="file")
        )
        ```
    """
    effective_sys = self._sys if sys is None else sys

    # Work on a copy so this chain's own settings object is left untouched.
    merged = copy.copy(self._settings)
    overrides = Settings(cache, parallel, workers, min_task_size, prefetch)
    merged.add(overrides)

    return self._evolve(settings=merged, _sys=effective_sys)

setup

setup(**kwargs) -> Self

Setup variables to pass to UDF functions.

Use before running map/gen/agg/batch_map to save an object and pass it as an argument to the UDF.

The value must be a callable (a lambda: <value> syntax can be used to quickly create one) that returns the object to be passed to the UDF. It is evaluated lazily when UDF is running, in case of multiple machines the callable is run on a worker machine.

Example

import anthropic
from anthropic.types import Message
import datachain as dc

(
    dc.read_storage(DATA, type="text")
    .settings(parallel=4, cache=True)

    # Setup Anthropic client and pass it to the UDF below automatically
    # The value is callable (see the note above)
    .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))

    .map(
        claude=lambda client, file: client.messages.create(
            model=MODEL,
            system=PROMPT,
            messages=[{"role": "user", "content": file.get_value()}],
        ),
        output=Message,
    )
)

Source code in datachain/lib/dc/datachain.py

def setup(self, **kwargs) -> "Self":
    """Setup variables to pass to UDF functions.

    Use before running map/gen/agg/batch_map to save an object and pass it as an
    argument to the UDF.

    The value must be a callable (a `lambda: <value>` syntax can be used to quickly
    create one) that returns the object to be passed to the UDF. It is evaluated
    lazily when UDF is running, in case of multiple machines the callable is run on
    a worker machine.

    Example:
        ```py
        import anthropic
        from anthropic.types import Message
        import datachain as dc

        (
            dc.read_storage(DATA, type="text")
            .settings(parallel=4, cache=True)

            # Setup Anthropic client and pass it to the UDF below automatically
            # The value is callable (see the note above)
            .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))

            .map(
                claude=lambda client, file: client.messages.create(
                    model=MODEL,
                    system=PROMPT,
                    messages=[{"role": "user", "content": file.get_value()}],
                ),
                output=Message,
            )
        )
        ```
    """
    intersection = set(self._setup.keys()) & set(kwargs.keys())
    if intersection:
        keys = ", ".join(intersection)
        raise DatasetPrepareError(self.name, f"this value(s) already setup: {keys}")

    self._setup = self._setup | kwargs
    return self

show

show(
    limit: int = 20,
    flatten=False,
    transpose=False,
    truncate=True,
    include_hidden=False,
) -> None

Show a preview of the chain results.

Parameters:

limit –

How many rows to show.
flatten –

Whether to use a multiindex or flatten column names.
transpose –

Whether to transpose rows and columns.
truncate –

Whether or not to truncate the contents of columns.
include_hidden –

Whether to include hidden columns.

Source code in datachain/lib/dc/datachain.py

def show(
    self,
    limit: int = 20,
    flatten=False,
    transpose=False,
    truncate=True,
    include_hidden=False,
) -> None:
    """Show a preview of the chain results.

    Parameters:
        limit : How many rows to show. A value of 0 or less shows the whole
            chain.
        flatten : Whether to use a multiindex or flatten column names.
        transpose : Whether to transpose rows and columns.
        truncate : Whether or not to truncate the contents of columns.
        include_hidden : Whether to include hidden columns.
    """
    import pandas as pd

    dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
    df = dc.to_pandas(flatten, include_hidden=include_hidden)

    if df.empty:
        print("Empty result")
        print(f"Columns: {list(df.columns)}")
        return

    # Count the rows before transposing: afterwards len(df) is the number of
    # columns, which would make the "Limited by" notice below wrong.
    row_count = len(df)

    if transpose:
        df = df.T

    options: list = [
        "display.max_columns",
        None,
        "display.multi_sparse",
        False,
    ]

    # Use the terminal width when one is available; without a terminal the
    # lookup raises OSError and the pandas default width is kept.
    try:
        if columns := os.get_terminal_size().columns:
            options.extend(["display.width", columns])
    except OSError:
        pass

    if not truncate:
        options.extend(["display.max_colwidth", None])

    with pd.option_context(*options):
        if inside_notebook():
            from IPython.display import display

            display(df)
        else:
            print(df)

    if row_count == limit:
        print(f"\n[Limited by {row_count} rows]")

shuffle

shuffle() -> Self

Shuffle the rows of the chain deterministically.

Source code in datachain/lib/dc/datachain.py

def shuffle(self) -> "Self":
    """Return the chain with its rows shuffled deterministically.

    The shuffle is expressed as an ordering on the `sys.rand` signal.
    """
    shuffled = self.order_by("sys.rand")
    return shuffled

subtract

subtract(
    other: DataChain,
    on: Optional[Union[str, Sequence[str]]] = None,
    right_on: Optional[Union[str, Sequence[str]]] = None,
) -> Self

Remove rows that appear in another chain.

Parameters:

other (DataChain) –

chain whose rows will be removed from self
on (Optional[Union[str, Sequence[str]]], default: None ) –

columns to consider for determining row equality in self. If unspecified, defaults to all common columns between self and other.
right_on (Optional[Union[str, Sequence[str]]], default: None ) –

columns to consider for determining row equality in other. If unspecified, defaults to the same values as on.

Source code in datachain/lib/dc/datachain.py

def subtract(  # type: ignore[override]
    self,
    other: "DataChain",
    on: Optional[Union[str, Sequence[str]]] = None,
    right_on: Optional[Union[str, Sequence[str]]] = None,
) -> "Self":
    """Return a chain without the rows that are also present in `other`.

    Parameters:
        other: chain whose rows will be removed from `self`
        on: column name(s) of `self` used to decide whether two rows are equal.
            When omitted (together with `right_on`), every column that
            `self` and `other` have in common is used.
        right_on: column name(s) of `other` matched positionally against `on`.
            When omitted, the `on` columns are used for both chains.

    Raises:
        DataChainParamsError: on empty column names, when there are no common
            columns, when `right_on` is given without `on`, or when `on` and
            `right_on` differ in length.
        TypeError: when both `on` and `right_on` are given but one of them is
            neither a string nor a sequence.
    """

    def _normalize(value, empty_str_msg, empty_item_msg):
        # A bare string names a single column and is wrapped in a list;
        # sequences are validated and passed through untouched.
        if isinstance(value, str):
            if not value:
                raise DataChainParamsError(empty_str_msg)
            return [value]
        if isinstance(value, Sequence) and (not value or not all(value)):
            raise DataChainParamsError(empty_item_msg)
        return value

    on = _normalize(
        on,
        "'on' cannot be an empty string",
        "'on' cannot contain empty strings",
    )
    right_on = _normalize(
        right_on,
        "'right_on' cannot be an empty string",
        "'right_on' cannot contain empty strings",
    )

    if on is None:
        if right_on is not None:
            raise DataChainParamsError(
                "'on' must be specified when 'right_on' is provided"
            )
        # Neither side given: compare on the columns both chains share,
        # keeping the column order of `self`.
        shared = set(other._effective_signals_schema.db_signals())
        signals = [
            name
            for name in self._effective_signals_schema.db_signals()
            if name in shared
        ]
        if not signals:
            raise DataChainParamsError("subtract(): no common columns")
    elif right_on is None:
        # Only `on` given: the same columns are used for both chains.
        signals = list(self.signals_schema.resolve(*on).db_signals())
    else:
        both_sequences = isinstance(on, Sequence) and isinstance(right_on, Sequence)
        if not both_sequences:
            raise TypeError(
                "'on' and 'right_on' must be 'str' or 'Sequence' object"
            )
        if len(on) != len(right_on):
            raise DataChainParamsError(
                "'on' and 'right_on' must have the same length"
            )
        # Pair up left/right columns positionally.
        left_columns = self.signals_schema.resolve(*on).db_signals()
        right_columns = other.signals_schema.resolve(*right_on).db_signals()
        signals = list(zip(left_columns, right_columns))  # type: ignore[arg-type]

    subtracted = self._query.subtract(other._query, signals)  # type: ignore[arg-type]
    return self._evolve(query=subtracted)

sum

sum(fr: DataType)

Compute the sum of a column.

Source code in datachain/lib/dc/datachain.py

def sum(self, fr: DataType):  # type: ignore[override]
    """Compute the sum of a column.

    `fr` is forwarded unchanged to `_extend_to_data_model` together with the
    aggregate name `"sum"`.
    """
    total = self._extend_to_data_model("sum", fr)
    return total

to_columnar_data_with_names

to_columnar_data_with_names(
    chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
) -> tuple[list[str], Iterator[list[list[Any]]]]

Returns column names and the results as an iterator that provides chunks, with each chunk containing a list of columns, where each column contains a list of the row values for that column in that chunk. Useful for columnar data formats, such as parquet or other OLAP databases.

Source code in datachain/lib/dc/datachain.py

def to_columnar_data_with_names(
    self, chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE
) -> tuple[list[str], Iterator[list[list[Any]]]]:
    """Return the flattened column names and a lazy iterator of columnar chunks.

    Each chunk is a list with one entry per column, and each entry is the list
    of that column's values for the rows of the chunk. Rows are grouped by
    `batched_it(rows, chunk_size)`. This layout suits columnar formats such as
    parquet or other OLAP databases.
    """
    schema_headers, _ = self._effective_signals_schema.get_headers_with_length()
    # Join the non-empty parts of every header path into a dotted name.
    names = [
        ".".join(part for part in header if part) for header in schema_headers
    ]

    flat_rows = self.collect_flatten()

    def column_chunks() -> Iterator[list[list[Any]]]:
        for batch in batched_it(flat_rows, chunk_size):
            records = list(batch)
            # Pivot the row-major batch into one value list per column.
            yield [
                [record[position] for record in records]
                for position in range(len(names))
            ]

    return names, column_chunks()

to_csv

to_csv(
    path: Union[str, PathLike[str]],
    delimiter: str = ",",
    fs_kwargs: Optional[dict[str, Any]] = None,
    **kwargs
) -> None

Save chain to a csv (comma-separated values) file.

Parameters:

path –

Path to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
delimiter –

Delimiter to use for the resulting file.
fs_kwargs –

Optional kwargs to pass to the fsspec filesystem, used only for write, for fsspec-type URLs, such as s3:// or hf:// when provided as the destination path.

Source code in datachain/lib/dc/datachain.py

def to_csv(
    self,
    path: Union[str, os.PathLike[str]],
    delimiter: str = ",",
    fs_kwargs: Optional[dict[str, Any]] = None,
    **kwargs,
) -> None:
    """Write the chain to a CSV (comma-separated values) file.

    Parameters:
        path : Destination of the file. Local paths are opened with the
            builtin `open`; string paths containing "://" (such as s3:// or
            hf://) are opened through an fsspec filesystem.
        delimiter : Delimiter to use for the resulting file.
        fs_kwargs : Optional kwargs for the fsspec filesystem. They are merged
            over the catalog's client config and only used when `path` is an
            fsspec-type URL.
        **kwargs : Extra keyword arguments passed to `csv.writer`.
    """
    import csv

    if isinstance(path, str) and "://" in path:
        from datachain.client.fsspec import Client

        # Caller-supplied options take precedence over the catalog defaults.
        fs_kwargs = {
            **self._query.catalog.client_config,
            **(fs_kwargs or {}),
        }
        open_file = Client.get_implementation(path).create_fs(**fs_kwargs).open
    else:
        open_file = open

    schema_headers, _ = self._effective_signals_schema.get_headers_with_length()
    header_row = [
        ".".join(part for part in header if part) for header in schema_headers
    ]

    rows = self.collect_flatten()

    with open_file(path, "w", newline="") as out:
        csv_writer = csv.writer(out, delimiter=delimiter, **kwargs)
        csv_writer.writerow(header_row)
        csv_writer.writerows(rows)

to_json

to_json(
    path: Union[str, PathLike[str]],
    fs_kwargs: Optional[dict[str, Any]] = None,
    include_outer_list: bool = True,
) -> None

Save chain to a JSON file.

Parameters:

path –

Path to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
fs_kwargs –

Optional kwargs to pass to the fsspec filesystem, used only for write, for fsspec-type URLs, such as s3:// or hf:// when provided as the destination path.
include_outer_list –

Sets whether to include an outer list for all rows. Setting this to True makes the file valid JSON, while False instead writes in the JSON lines format.

Source code in datachain/lib/dc/datachain.py

def to_json(
    self,
    path: Union[str, os.PathLike[str]],
    fs_kwargs: Optional[dict[str, Any]] = None,
    include_outer_list: bool = True,
) -> None:
    """Write the chain to a JSON (or JSON lines) file.

    Parameters:
        path : Destination of the file. Local paths are opened with the
            builtin `open`; string paths containing "://" (such as s3:// or
            hf://) are opened through an fsspec filesystem.
        fs_kwargs : Optional kwargs for the fsspec filesystem. They are merged
            over the catalog's client config and only used when `path` is an
            fsspec-type URL.
        include_outer_list : When True, rows are wrapped in a single outer list
            so the file is valid JSON. When False, one row is written per line
            (JSON lines format).
    """
    if isinstance(path, str) and "://" in path:
        from datachain.client.fsspec import Client

        # Caller-supplied options take precedence over the catalog defaults.
        fs_kwargs = {
            **self._query.catalog.client_config,
            **(fs_kwargs or {}),
        }
        open_file = Client.get_implementation(path).create_fs(**fs_kwargs).open
    else:
        open_file = open

    schema_headers, _ = self._effective_signals_schema.get_headers_with_length()
    key_paths = [[part for part in header if part] for header in schema_headers]

    # Inside a JSON array rows are comma-separated; JSON lines only needs "\n".
    row_separator = b",\n" if include_outer_list else b"\n"

    with open_file(path, "wb") as out:
        if include_outer_list:
            out.write(b"[\n")
        for position, row in enumerate(self.collect_flatten()):
            if position:
                out.write(row_separator)
            out.write(orjson.dumps(row_to_nested_dict(key_paths, row)))
        if include_outer_list:
            out.write(b"\n]\n")

to_jsonl

to_jsonl(
    path: Union[str, PathLike[str]],
    fs_kwargs: Optional[dict[str, Any]] = None,
) -> None

Save chain to a JSON lines file.

Parameters:

path –

Path to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
fs_kwargs –

Optional kwargs to pass to the fsspec filesystem, used only for write, for fsspec-type URLs, such as s3:// or hf:// when provided as the destination path.

Source code in datachain/lib/dc/datachain.py

def to_jsonl(
    self,
    path: Union[str, os.PathLike[str]],
    fs_kwargs: Optional[dict[str, Any]] = None,
) -> None:
    """Write the chain to a JSON lines file (one row per line).

    Parameters:
        path : Destination of the file; local paths as well as fsspec-type
            URLs such as s3:// or hf:// are supported.
        fs_kwargs : Optional kwargs for the fsspec filesystem, used only when
            `path` is an fsspec-type URL.
    """
    # JSON lines is `to_json` output without the enclosing outer list.
    self.to_json(path, fs_kwargs, include_outer_list=False)

to_pandas

to_pandas(flatten=False, include_hidden=True) -> DataFrame

Return a pandas DataFrame from the chain.

Parameters:

flatten –

Whether to use a multiindex or flatten column names.
include_hidden –

Whether to include hidden columns.

Source code in datachain/lib/dc/datachain.py

def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
    """Build a pandas DataFrame holding the rows of the chain.

    Parameters:
        flatten : When True, nested column names are joined with "." into flat
            names. Otherwise a MultiIndex is used whenever the schema headers
            are at least two levels deep.
        include_hidden : Whether to include hidden columns.
    """
    import pandas as pd

    schema = self._effective_signals_schema
    headers, depth = schema.get_headers_with_length(include_hidden=include_hidden)

    if not flatten and depth >= 2:
        columns = pd.MultiIndex.from_tuples([tuple(header) for header in headers])
    else:
        # Drop empty path components before joining into a dotted name.
        columns = [".".join(part for part in header if part) for header in headers]

    records = self.results(include_hidden=include_hidden)
    return pd.DataFrame.from_records(records, columns=columns)

to_parquet

to_parquet(
    path: Union[str, PathLike[str], BinaryIO],
    partition_cols: Optional[Sequence[str]] = None,
    chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
    fs_kwargs: Optional[dict[str, Any]] = None,
    **kwargs
) -> None

Save chain to parquet file with SignalSchema metadata.

Parameters:

path –

Path or a file-like binary object to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
partition_cols –

Column names by which to partition the dataset.
chunk_size –

The chunk size of results to read and convert to columnar data, to avoid running out of memory.
fs_kwargs –

Optional kwargs to pass to the fsspec filesystem, used only for write, for fsspec-type URLs, such as s3:// or hf:// when provided as the destination path.

Source code in datachain/lib/dc/datachain.py

def to_parquet(
    self,
    path: Union[str, os.PathLike[str], BinaryIO],
    partition_cols: Optional[Sequence[str]] = None,
    chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
    fs_kwargs: Optional[dict[str, Any]] = None,
    **kwargs,
) -> None:
    """Save chain to parquet file with SignalSchema metadata.

    Results are converted and written chunk by chunk. Without
    `partition_cols` all chunks go into a single file through one
    `ParquetWriter`, which is closed even if a chunk fails to convert or write.

    Parameters:
        path : Path or a file-like binary object to save the file. This supports
            local paths as well as remote paths, such as s3:// or hf:// with fsspec.
        partition_cols : Column names by which to partition the dataset.
        chunk_size : The chunk size of results to read and convert to columnar
            data, to avoid running out of memory.
        fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
            write, for fsspec-type URLs, such as s3:// or hf:// when
            provided as the destination path.
        **kwargs : Extra keyword arguments passed to `pq.write_to_dataset`
            (partitioned output) or `pq.ParquetWriter` (single file).
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    from datachain.lib.arrow import DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY

    fsspec_fs = None

    if isinstance(path, str) and "://" in path:
        from datachain.client.fsspec import Client

        # Caller-supplied options take precedence over the catalog defaults.
        fs_kwargs = {
            **self._query.catalog.client_config,
            **(fs_kwargs or {}),
        }

        client = Client.get_implementation(path)

        if path.startswith("file://"):
            # pyarrow does not handle file:// uris, and needs a direct path instead.
            from urllib.parse import urlparse

            path = urlparse(path).path
            if sys.platform == "win32":
                path = os.path.normpath(path.lstrip("/"))

        fsspec_fs = client.create_fs(**fs_kwargs)

    _partition_cols = list(partition_cols) if partition_cols else None
    signal_schema_metadata = orjson.dumps(
        self._effective_signals_schema.serialize()
    )

    column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)

    parquet_schema = None
    parquet_writer = None

    try:
        for chunk in column_chunks:
            # pyarrow infers the best parquet schema from the python types of
            # the input data. The schema of the first chunk is reused for all
            # following chunks.
            table = pa.Table.from_pydict(
                dict(zip(column_names, chunk)),
                schema=parquet_schema,
            )

            # Preserve any existing metadata, and add the DataChain SignalSchema.
            existing_metadata = table.schema.metadata or {}
            merged_metadata = {
                **existing_metadata,
                DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY: signal_schema_metadata,
            }
            table = table.replace_schema_metadata(merged_metadata)
            parquet_schema = table.schema

            if _partition_cols:
                # Write to a partitioned parquet dataset.
                pq.write_to_dataset(
                    table,
                    root_path=path,
                    partition_cols=_partition_cols,
                    filesystem=fsspec_fs,
                    **kwargs,
                )
                continue

            if parquet_writer is None:
                # Write to a single parquet file; the writer is created lazily
                # because the schema is only known after the first chunk.
                parquet_writer = pq.ParquetWriter(
                    path, parquet_schema, filesystem=fsspec_fs, **kwargs
                )

            parquet_writer.write_table(table)
    finally:
        # Release the underlying file handle even when a chunk fails midway.
        if parquet_writer is not None:
            parquet_writer.close()

to_pytorch

to_pytorch(
    transform=None,
    tokenizer=None,
    tokenizer_kwargs=None,
    num_samples=0,
    remove_prefetched: bool = False,
)

Convert to pytorch dataset format.

Parameters:

transform (Transform, default: None ) –

Torchvision transforms to apply to the dataset.
tokenizer (Callable, default: None ) –

Tokenizer to use to tokenize text values.
tokenizer_kwargs (dict, default: None ) –

Additional kwargs to pass when calling tokenizer.
num_samples (int, default: 0 ) –

Number of random samples to draw for each epoch. This argument is ignored if num_samples=0 (the default).
remove_prefetched (bool, default: False ) –

Whether to remove prefetched files after reading.

Example

from torch.utils.data import DataLoader
loader = DataLoader(
    chain.select("file", "label").to_pytorch(),
    batch_size=16
)

Source code in datachain/lib/dc/datachain.py

def to_pytorch(
    self,
    transform=None,
    tokenizer=None,
    tokenizer_kwargs=None,
    num_samples=0,
    remove_prefetched: bool = False,
):
    """Wrap the chain in a `PytorchDataset`.

    A chain whose query is not attached to a dataset is persisted first, so
    that the resulting dataset can be referenced by name and version.

    Args:
        transform (Transform): Torchvision transforms to apply to the dataset.
        tokenizer (Callable): Tokenizer to use to tokenize text values.
        tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
        num_samples (int): Number of random samples to draw for each epoch.
            This argument is ignored if `num_samples=0` (the default).
        remove_prefetched (bool): Whether to remove prefetched files after reading.

    Example:
        ```py
        from torch.utils.data import DataLoader
        loader = DataLoader(
            chain.select("file", "label").to_pytorch(),
            batch_size=16
        )
        ```
    """
    from datachain.torch import PytorchDataset

    chain = self if self._query.attached else self.persist()
    assert chain.name is not None  # for mypy

    dataset_options = {
        "catalog": self.session.catalog,
        "transform": transform,
        "tokenizer": tokenizer,
        "tokenizer_kwargs": tokenizer_kwargs,
        "num_samples": num_samples,
        "dc_settings": chain._settings,
        "remove_prefetched": remove_prefetched,
    }
    return PytorchDataset(chain.name, chain.version, **dataset_options)

to_records

to_records() -> list[dict[str, Any]]

Convert every row to a dictionary.

Source code in datachain/lib/dc/datachain.py

def to_records(self) -> list[dict[str, Any]]:
    """Return every row as a dictionary keyed by column name."""

    def row_as_mapping(names: list[str], values: tuple[Any, ...]) -> dict[str, Any]:
        # Pair each column name with the value at the same position.
        return {name: value for name, value in zip(names, values)}

    return self.results(row_factory=row_as_mapping)

to_storage

to_storage(
    output: Union[str, PathLike[str]],
    signal: str = "file",
    placement: ExportPlacement = "fullpath",
    link_type: Literal["copy", "symlink"] = "copy",
    num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
    anon: bool = False,
    client_config: Optional[dict] = None,
) -> None

Export files from a specified signal to a directory. Files can be exported to a local or cloud directory.

Parameters:

output (Union[str, PathLike[str]]) –

Path to the target directory for exporting files.
signal (str, default: 'file' ) –

Name of the signal to export files from.
placement (ExportPlacement, default: 'fullpath' ) –

The method to use for naming exported files. The possible values are: "filename", "etag", "fullpath", and "checksum".
link_type (Literal['copy', 'symlink'], default: 'copy' ) –

Method to use for exporting files. Falls back to 'copy' if symlinking fails.
num_threads –

number of threads to use for exporting files. By default it uses 5 threads.
anon (bool, default: False ) –

If true, the cloud bucket is treated as a public one (anonymous access).
client_config (Optional[dict], default: None ) –

Optional configuration for the destination storage client

Example

Cross cloud transfer

import datachain as dc

ds = dc.read_storage("s3://mybucket")
ds.to_storage("gs://mybucket", placement="filename")

Source code in datachain/lib/dc/datachain.py

def to_storage(
    self,
    output: Union[str, os.PathLike[str]],
    signal: str = "file",
    placement: FileExportPlacement = "fullpath",
    link_type: Literal["copy", "symlink"] = "copy",
    num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
    anon: bool = False,
    client_config: Optional[dict] = None,
) -> None:
    """Export the files of a signal to a local or cloud directory.

    Args:
        output: Path to the target directory for exporting files.
        signal: Name of the signal to export files from.
        placement: The method to use for naming exported files.
            The possible values are: "filename", "etag", "fullpath", and "checksum".
        link_type: Method to use for exporting files.
            Falls back to `'copy'` if symlinking fails.
        num_threads: Number of threads to use for exporting files. Defaults to
            `EXPORT_FILES_MAX_THREADS`; a falsy value results in one thread.
        anon: If true, `{"anon": True}` is added to the client configuration.
        client_config: Optional configuration for the destination storage client

    Raises:
        ValueError: if `placement` is "filename" and several files share the
            same name.

    Example:
        Cross cloud transfer
        ```py
        import datachain as dc

        ds = dc.read_storage("s3://mybucket")
        ds.to_storage("gs://mybucket", placement="filename")
        ```
    """
    if placement == "filename":
        # Exporting by bare file name only works when every name is unique.
        name_column = pathfunc.name(C(f"{signal}__path"))
        unique_names = self._query.distinct(name_column).count()
        if unique_names != self._query.count():
            raise ValueError("Files with the same name found")

    if anon:
        client_config = (client_config or {}) | {"anon": True}

    progress_bar = tqdm(
        desc=f"Exporting files to {output}: ",
        unit=" files",
        unit_scale=True,
        unit_divisor=10,
        total=self.count(),
        leave=False,
    )

    use_cache = self._settings.cache if self._settings else False
    exporter = FileExporter(
        output,
        placement,
        use_cache,
        link_type,
        max_threads=num_threads or 1,
        client_config=client_config,
    )
    exporter.run(self.collect(signal), progress_bar)

union

union(other: Self) -> Self

Return the set union of the two datasets.

Parameters:

other (Self) –

chain whose rows will be added to self.

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def union(self, other: "Self") -> "Self":
    """Return the set union of the two datasets.

    Parameters:
        other: chain whose rows are combined with the rows of `self`.
    """
    combined_query = self._query.union(other._query)
    return self._evolve(query=combined_query)

DataChainError

DataChainError(message)

Bases: Exception

Source code in datachain/lib/utils.py

def __init__(self, message):
    """Create the error, passing `message` on to the base class initializer."""
    super().__init__(message)

Session

Session(
    name="",
    catalog: Optional[Catalog] = None,
    client_config: Optional[dict] = None,
    in_memory: bool = False,
)

Session is a context that keeps track of temporary DataChain datasets for a proper cleanup. By default, a global session is created.

Temporary or ephemeral datasets are the ones created without specified name. They are useful for optimization purposes and should be automatically removed.

Temp dataset has specific name format

"session_<name>_<session_uuid>_<dataset_uuid>"

The <name> part is optional. Both <uuid>s are auto-generated.

Temp dataset examples

session_myname_624b41_48e8b4 session_4b962d_2a5dff

Parameters:

name (str): The name of the session. Only letters and numbers are supported. It can be empty. catalog (Catalog): Catalog object.

Source code in datachain/query/session.py

def __init__(
    self,
    name="",
    catalog: Optional["Catalog"] = None,
    client_config: Optional[dict] = None,
    in_memory: bool = False,
):
    """Create a session with a unique, UUID-suffixed name.

    Parameters:
        name: Session name made of ASCII letters and digits only. An empty
            name falls back to `GLOBAL_SESSION_NAME`.
        catalog: Catalog to use. When omitted, one is obtained from
            `get_catalog(client_config=..., in_memory=...)`.
        client_config: Passed to `get_catalog` when no catalog is given.
        in_memory: Passed to `get_catalog` when no catalog is given.

    Raises:
        ValueError: if `name` contains anything other than letters or numbers.
    """
    # `fullmatch` is used instead of `match` with a `$` anchor: `$` also
    # matches just before a trailing newline, which would let a name such as
    # "abc\n" through.
    if re.fullmatch(r"[0-9a-zA-Z]*", name) is None:
        raise ValueError(
            f"Session name can contain only letters or numbers - '{name}' given."
        )

    if not name:
        name = self.GLOBAL_SESSION_NAME

    session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
    self.name = f"{name}_{session_uuid}"
    # Records whether the catalog was created here rather than supplied.
    self.is_new_catalog = not catalog
    self.catalog = catalog or get_catalog(
        client_config=client_config, in_memory=in_memory
    )
    self.dataset_versions: list[tuple[DatasetRecord, str, bool]] = []

get `classmethod`

get(
    session: Optional[Session] = None,
    catalog: Optional[Catalog] = None,
    client_config: Optional[dict] = None,
    in_memory: bool = False,
) -> Session

Creates a Session() object from a catalog.

Parameters:

session (Session, default: None ) –

Optional Session(). If not provided a new session will be created. It's needed mostly for simple API purposes.
catalog (Catalog, default: None ) –

Optional catalog. By default, a new catalog is created.

Source code in datachain/query/session.py

@classmethod
def get(
    cls,
    session: Optional["Session"] = None,
    catalog: Optional["Catalog"] = None,
    client_config: Optional[dict] = None,
    in_memory: bool = False,
) -> "Session":
    """Return the session to use, creating one when necessary.

    Resolution order: an explicitly passed `session`, then the most recent
    entry of `SESSION_CONTEXTS`, then the global session (created on first
    use). If `client_config` is given and differs from the resolved session's
    catalog client config, a new session is created and entered instead.

    Parameters:
        session (Session): Optional Session(). When provided it is returned
                as is. It's needed mostly for simple API purposes.
        catalog (Catalog): Optional catalog, used only when a new session has
                to be created.
        client_config (dict): Optional client configuration for a new session.
        in_memory (bool): Passed on to any newly created session.
    """
    if session:
        return session

    if cls.SESSION_CONTEXTS:
        # Access the active (most recent) context from the stack
        active = cls.SESSION_CONTEXTS[-1]
    else:
        if cls.GLOBAL_SESSION_CTX is None:
            # First use: create the global session and install its cleanup
            # and exception hooks.
            cls.GLOBAL_SESSION_CTX = Session(
                cls.GLOBAL_SESSION_NAME,
                catalog,
                client_config=client_config,
                in_memory=in_memory,
            )
            atexit.register(cls._global_cleanup)
            cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
            sys.excepthook = cls.except_hook
        active = cls.GLOBAL_SESSION_CTX

    if client_config and active.catalog.client_config != client_config:
        # The resolved session uses a different client config, so a separate
        # session is created and entered.
        session_name = "session" + uuid4().hex[:4]
        active = Session(
            session_name,
            catalog,
            client_config=client_config,
            in_memory=in_memory,
        )
        active.__enter__()

    return active

Sys

Bases: DataModel

Model for internal DataChain signals id and rand.

DataChain

C module-attribute

Column

glob

regexp

read_csv

read_dataset

datasets

read_hf

read_json

listings

read_pandas

read_parquet

read_records

read_storage

read_values

read_database

ConnectionType module-attribute

DataChain

dataset property

delta property

empty property

name property

schema property

session property

version property

__or__

__repr__

agg

apply

avg

batch_map

c

chunk

clone

collect

collect_flatten

column

compare

count

diff

distinct

exec

explode

filter

gen

group_by

limit

map

max

merge

min

mutate

offset

order_by

parse_tabular

persist

print_schema

reset_settings

sample

save

select

select_except

settings

setup

show

shuffle

subtract

sum

to_columnar_data_with_names

to_csv

to_json

to_jsonl

to_pandas

to_parquet

to_pytorch

to_records

to_storage

union

DataChainError

C `module-attribute`

ConnectionType `module-attribute`

dataset `property`

delta `property`

empty `property`

name `property`

schema `property`

session `property`

version `property`

__or__

__repr__

get `classmethod`