Source code for deephaven.parquet

#
# Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
#

""" This module supports reading an external Parquet files into Deephaven tables and writing Deephaven tables out as
Parquet files. """
from warnings import warn
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Union, Dict, Sequence

import jpy

from deephaven import DHError
from deephaven.column import Column
from deephaven.dtypes import DType
from deephaven.jcompat import j_array_list
from deephaven.table import Table, PartitionedTable
from deephaven.experimental import s3

_JParquetTools = jpy.get_type("io.deephaven.parquet.table.ParquetTools")
_JFile = jpy.get_type("java.io.File")
_JCompressionCodecName = jpy.get_type("org.apache.parquet.hadoop.metadata.CompressionCodecName")
_JParquetInstructions = jpy.get_type("io.deephaven.parquet.table.ParquetInstructions")
_JParquetFileLayout = jpy.get_type("io.deephaven.parquet.table.ParquetInstructions$ParquetFileLayout")
_JTableDefinition = jpy.get_type("io.deephaven.engine.table.TableDefinition")



[docs]
@dataclass
class ColumnInstruction:
    """  This class specifies the instructions for reading/writing a Parquet column. """
    column_name: Optional[str] = None
    parquet_column_name: Optional[str] = None
    codec_name: Optional[str] = None
    codec_args: Optional[str] = None
    use_dictionary: bool = False




[docs]
class ParquetFileLayout(Enum):
    """ The parquet file layout. """

    SINGLE_FILE = 1
    """ A single parquet file. """

    FLAT_PARTITIONED = 2
    """ A single directory of parquet files. """

    KV_PARTITIONED = 3
    """ A hierarchically partitioned directory layout of parquet files. Directory names are of the format "key=value" 
     with keys derived from the partitioning columns. """

    METADATA_PARTITIONED = 4
    """
    Layout can be used to describe:
        - A directory containing a METADATA_FILE_NAME parquet file and an optional COMMON_METADATA_FILE_NAME parquet file
        - A single parquet METADATA_FILE_NAME file
        - A single parquet COMMON_METADATA_FILE_NAME file
    """



def _build_parquet_instructions(
    col_instructions: Optional[List[ColumnInstruction]] = None,
    compression_codec_name: Optional[str] = None,
    max_dictionary_keys: Optional[int] = None,
    max_dictionary_size: Optional[int] = None,
    is_legacy_parquet: bool = False,
    target_page_size: Optional[int] = None,
    is_refreshing: bool = False,
    for_read: bool = True,
    force_build: bool = False,
    generate_metadata_files: Optional[bool] = None,
    base_name: Optional[str] = None,
    file_layout: Optional[ParquetFileLayout] = None,
    table_definition: Optional[Union[Dict[str, DType], List[Column]]] = None,
    col_definitions: Optional[List[Column]] = None,
    index_columns: Optional[Sequence[Sequence[str]]] = None,
    special_instructions: Optional[s3.S3Instructions] = None,
):
    if not any(
        [
            force_build,
            col_instructions,
            compression_codec_name,
            max_dictionary_keys is not None,
            max_dictionary_size is not None,
            is_legacy_parquet,
            target_page_size is not None,
            is_refreshing,
            generate_metadata_files is not None,
            base_name is not None,
            file_layout is not None,
            table_definition is not None,
            col_definitions is not None,
            index_columns is not None,
            special_instructions is not None
        ]
    ):
        return _JParquetInstructions.EMPTY

    builder = _JParquetInstructions.builder()
    if col_instructions is not None:
        for ci in col_instructions:
            if for_read and not ci.parquet_column_name:
                raise ValueError("must specify the parquet column name for read.")
            if not for_read and not ci.column_name:
                raise ValueError("must specify the table column name for write.")

            builder.addColumnNameMapping(ci.parquet_column_name, ci.column_name)
            if ci.column_name:
                if ci.codec_name:
                    builder.addColumnCodec(ci.column_name, ci.codec_name, ci.codec_args)
                builder.useDictionary(ci.column_name, ci.use_dictionary)

    if compression_codec_name:
        builder.setCompressionCodecName(compression_codec_name)

    if max_dictionary_keys is not None:
        builder.setMaximumDictionaryKeys(max_dictionary_keys)

    if max_dictionary_size is not None:
        builder.setMaximumDictionarySize(max_dictionary_size)

    if is_legacy_parquet:
        builder.setIsLegacyParquet(is_legacy_parquet)

    if target_page_size is not None:
        builder.setTargetPageSize(target_page_size)

    if is_refreshing:
        builder.setIsRefreshing(is_refreshing)

    if generate_metadata_files:
        builder.setGenerateMetadataFiles(generate_metadata_files)

    if base_name:
        builder.setBaseNameForPartitionedParquetData(base_name)

    if file_layout is not None:
        builder.setFileLayout(_j_file_layout(file_layout))

    if table_definition is not None and col_definitions is not None:
        raise ValueError("table_definition and col_definitions cannot both be specified.")

    if table_definition is not None:
        builder.setTableDefinition(_j_table_definition(table_definition))

    if col_definitions is not None:
        builder.setTableDefinition(_JTableDefinition.of([col.j_column_definition for col in col_definitions]))

    if index_columns:
        builder.addAllIndexColumns(_j_list_of_list_of_string(index_columns))

    if special_instructions is not None:
        builder.setSpecialInstructions(special_instructions.j_object)

    return builder.build()

def _j_table_definition(table_definition: Union[Dict[str, DType], List[Column], None]) -> Optional[jpy.JType]:
    if table_definition is None:
        return None
    elif isinstance(table_definition, Dict):
        return _JTableDefinition.of(
            [
                Column(name=name, data_type=dtype).j_column_definition
                for name, dtype in table_definition.items()
            ]
        )
    elif isinstance(table_definition, List):
        return _JTableDefinition.of(
            [col.j_column_definition for col in table_definition]
        )
    else:
        raise DHError(f"Unexpected table_definition type: {type(table_definition)}")


def _j_file_layout(file_layout: Optional[ParquetFileLayout]) -> Optional[jpy.JType]:
    if file_layout is None:
        return None
    if file_layout == ParquetFileLayout.SINGLE_FILE:
        return _JParquetFileLayout.SINGLE_FILE
    if file_layout == ParquetFileLayout.FLAT_PARTITIONED:
        return _JParquetFileLayout.FLAT_PARTITIONED
    if file_layout == ParquetFileLayout.KV_PARTITIONED:
        return _JParquetFileLayout.KV_PARTITIONED
    if file_layout == ParquetFileLayout.METADATA_PARTITIONED:
        return _JParquetFileLayout.METADATA_PARTITIONED
    raise DHError(f"Invalid parquet file_layout '{file_layout}'")



[docs]
def read(
    path: str,
    col_instructions: Optional[List[ColumnInstruction]] = None,
    is_legacy_parquet: bool = False,
    is_refreshing: bool = False,
    file_layout: Optional[ParquetFileLayout] = None,
    table_definition: Union[Dict[str, DType], List[Column], None] = None,
    special_instructions: Optional[s3.S3Instructions] = None,
) -> Table:
    """ Reads in a table from a single parquet, metadata file, or directory with recognized layout.

    Args:
        path (str): the file or directory to examine
        col_instructions (Optional[List[ColumnInstruction]]): instructions for customizations while reading particular
            columns, default is None, which means no specialization for any column
        is_legacy_parquet (bool): if the parquet data is legacy
        is_refreshing (bool): if the parquet data represents a refreshing source
        file_layout (Optional[ParquetFileLayout]): the parquet file layout, by default None. When None, the layout is
            inferred.
        table_definition (Union[Dict[str, DType], List[Column], None]): the table definition, by default None. When None,
            the definition is inferred from the parquet file(s). Setting a definition guarantees the returned table will
            have that definition. This is useful for bootstrapping purposes when the initially partitioned directory is
            empty and is_refreshing=True. It is also useful for specifying a subset of the parquet definition.
        special_instructions (Optional[s3.S3Instructions]): Special instructions for reading parquet files, useful when
            reading files from a non-local file system, like S3. By default, None.

    Returns:
        a table

    Raises:
        DHError
    """

    try:
        read_instructions = _build_parquet_instructions(
            col_instructions=col_instructions,
            is_legacy_parquet=is_legacy_parquet,
            is_refreshing=is_refreshing,
            for_read=True,
            force_build=True,
            special_instructions=special_instructions,
            file_layout=file_layout,
            table_definition=table_definition,
        )
        return Table(_JParquetTools.readTable(path, read_instructions))
    except Exception as e:
        raise DHError(e, "failed to read parquet data.") from e


def _j_string_array(str_seq: Sequence[str]):
    return jpy.array("java.lang.String", str_seq)

def _j_list_of_list_of_string(str_seq_seq: Sequence[Sequence[str]]):
    return j_array_list([j_array_list(str_seq) for str_seq in str_seq_seq])


[docs]
def delete(path: str) -> None:
    """ Deletes a Parquet table on disk.

    Args:
        path (str): path to delete

    Raises:
        DHError
    """
    try:
        _JParquetTools.deleteTable(_JFile(path))
    except Exception as e:
        raise DHError(e, f"failed to delete a parquet table: {path} on disk.") from e




[docs]
def write(
    table: Table,
    path: str,
    table_definition: Optional[Union[Dict[str, DType], List[Column]]] = None,
    col_definitions: Optional[List[Column]] = None,
    col_instructions: Optional[List[ColumnInstruction]] = None,
    compression_codec_name: Optional[str] = None,
    max_dictionary_keys: Optional[int] = None,
    max_dictionary_size: Optional[int] = None,
    target_page_size: Optional[int] = None,
    generate_metadata_files: Optional[bool] = None,
    index_columns: Optional[Sequence[Sequence[str]]] = None
) -> None:
    """ Write a table to a Parquet file.

    Args:
        table (Table): the source table
        path (str): the destination file path; the file name should end in a ".parquet" extension. If the path
            includes any non-existing directories, they are created. If there is an error, any intermediate directories
            previously created are removed; note this makes this method unsafe for concurrent use
        table_definition (Optional[Union[Dict[str, DType], List[Column]]): the table definition to use for writing,
            instead of the definitions implied by the table. Default is None, which means use the column definitions
            implied by the table. This definition can be used to skip some columns or add additional columns with
            null values. Both table_definition and col_definitions cannot be specified at the same time.
        col_definitions (Optional[List[Column]]): the column definitions to use for writing, instead of the
            definitions implied by the table. Default is None, which means use the column definitions implied by the
            table. This argument is deprecated and will be removed in a future release. Use table_definition instead.
        col_instructions (Optional[List[ColumnInstruction]]): instructions for customizations while writing particular
            columns, default is None, which means no specialization for any column
        compression_codec_name (Optional[str]): the compression codec to use. Allowed values include "UNCOMPRESSED",
            "SNAPPY", "GZIP", "LZO", "LZ4", "LZ4_RAW", "ZSTD", etc. If not specified, defaults to "SNAPPY".
        max_dictionary_keys (Optional[int]): the maximum number of unique keys the writer should add to a dictionary page
            before switching to non-dictionary encoding, never evaluated for non-String columns, defaults to 2^20 (1,048,576)
        max_dictionary_size (Optional[int]): the maximum number of bytes the writer should add to the dictionary before
            switching to non-dictionary encoding, never evaluated for non-String columns, defaults to 2^20 (1,048,576)
        target_page_size (Optional[int]): the target page size in bytes, if not specified, defaults to 2^20 bytes (1 MiB)
        generate_metadata_files (Optional[bool]): whether to generate parquet _metadata and _common_metadata files,
            defaults to False. Generating these files can help speed up reading of partitioned parquet data because these
            files contain metadata (including schema) about the entire dataset, which can be used to skip reading some
            files.
        index_columns (Optional[Sequence[Sequence[str]]]): sequence of sequence containing the column names for indexes
            to persist. The write operation will store the index info for the provided columns as sidecar tables. For
            example, if the input is [["Col1"], ["Col1", "Col2"]], the write operation will store the index info for
            ["Col1"] and for ["Col1", "Col2"]. By default, data indexes to write are determined by those present on the
            source table. This argument can be used to narrow the set of indexes to write, or to be explicit about the
            expected set of indexes present on all sources. Indexes that are specified but missing will be computed on
            demand.
    Raises:
        DHError
    """
    if col_definitions is not None:
        warn("col_definitions is deprecated and will be removed in a future release. Use table_definition "
             "instead.", DeprecationWarning, stacklevel=2)
    try:
        write_instructions = _build_parquet_instructions(
            col_instructions=col_instructions,
            compression_codec_name=compression_codec_name,
            max_dictionary_keys=max_dictionary_keys,
            max_dictionary_size=max_dictionary_size,
            target_page_size=target_page_size,
            for_read=False,
            generate_metadata_files=generate_metadata_files,
            table_definition=table_definition,
            col_definitions=col_definitions,
            index_columns=index_columns,
        )
        _JParquetTools.writeTable(table.j_table, path, write_instructions)
    except Exception as e:
        raise DHError(e, "failed to write to parquet data.") from e




[docs]
def write_partitioned(
        table: Union[Table, PartitionedTable],
        destination_dir: str,
        table_definition: Optional[Union[Dict[str, DType], List[Column]]] = None,
        col_definitions: Optional[List[Column]] = None,
        col_instructions: Optional[List[ColumnInstruction]] = None,
        compression_codec_name: Optional[str] = None,
        max_dictionary_keys: Optional[int] = None,
        max_dictionary_size: Optional[int] = None,
        target_page_size: Optional[int] = None,
        base_name: Optional[str] = None,
        generate_metadata_files: Optional[bool] = None,
        index_columns: Optional[Sequence[Sequence[str]]] = None
) -> None:
    """ Write table to disk in parquet format with the partitioning columns written as "key=value" format in a nested
    directory structure. For example, for a partitioned column "date", we will have a directory structure like
    "date=2021-01-01/<base_name>.parquet", "date=2021-01-02/<base_name>.parquet", etc. where "2021-01-01" and
    "2021-01-02" are the partition values and "<base_name>" is passed as an optional parameter. All the necessary
    subdirectories are created if they do not exist.

    Args:
        table (Table): the source table or partitioned table
        destination_dir (str): The path to destination root directory in which the partitioned parquet data will be stored
            in a nested directory structure format. Non-existing directories in the provided path will be created.
        table_definition (Optional[Union[Dict[str, DType], List[Column]]): the table definition to use for writing,
            instead of the definitions implied by the table. Default is None, which means use the column definitions
            implied by the table. This definition can be used to skip some columns or add additional columns with
            null values. Both table_definition and col_definitions cannot be specified at the same time.
        col_definitions (Optional[List[Column]]): the column definitions to use for writing, instead of the definitions
            implied by the table. Default is None, which means use the column definitions implied by the table. This
            argument is deprecated and will be removed in a future release. Use table_definition instead.
        col_instructions (Optional[List[ColumnInstruction]]): instructions for customizations while writing particular
            columns, default is None, which means no specialization for any column
        compression_codec_name (Optional[str]): the compression codec to use. Allowed values include "UNCOMPRESSED",
            "SNAPPY", "GZIP", "LZO", "LZ4", "LZ4_RAW", "ZSTD", etc. If not specified, defaults to "SNAPPY".
        max_dictionary_keys (Optional[int]): the maximum number of unique keys the writer should add to a dictionary page
            before switching to non-dictionary encoding; never evaluated for non-String columns , defaults to 2^20 (1,048,576)
        max_dictionary_size (Optional[int]): the maximum number of bytes the writer should add to the dictionary before
            switching to non-dictionary encoding, never evaluated for non-String columns, defaults to 2^20 (1,048,576)
        target_page_size (Optional[int]): the target page size in bytes, if not specified, defaults to 2^20 bytes (1 MiB)
        base_name (Optional[str]): The base name for the individual partitioned tables, if not specified, defaults to
            `{uuid}`, so files will have names of the format `<uuid>.parquet` where `uuid` is a randomly generated UUID.
            Users can provide the following tokens in the base_name:
            - The token `{uuid}` will be replaced with a random UUID. For example, a base name of
            "table-{uuid}" will result in files named like "table-8e8ab6b2-62f2-40d1-8191-1c5b70c5f330.parquet.parquet".
            - The token `{partitions}` will be replaced with an underscore-delimited, concatenated string of
            partition values. For example, for a base name of "{partitions}-table" and partitioning columns "PC1" and
            "PC2", the file name is generated by concatenating the partition values "PC1=pc1" and "PC2=pc2"
            with an underscore followed by "-table.parquet", like "PC1=pc1_PC2=pc2-table.parquet".
            - The token `{i}` will be replaced with an automatically incremented integer for files in a directory. For
            example, a base name of "table-{i}" will result in files named like "PC=partition1/table-0.parquet",
            "PC=partition1/table-1.parquet", etc.
        generate_metadata_files (Optional[bool]): whether to generate parquet _metadata and _common_metadata files,
            defaults to False. Generating these files can help speed up reading of partitioned parquet data because these
            files contain metadata (including schema) about the entire dataset, which can be used to skip reading some
            files.
        index_columns (Optional[Sequence[Sequence[str]]]): sequence of sequence containing the column names for indexes
            to persist. The write operation will store the index info for the provided columns as sidecar tables. For
            example, if the input is [["Col1"], ["Col1", "Col2"]], the write operation will store the index info for
            ["Col1"] and for ["Col1", "Col2"]. By default, data indexes to write are determined by those present on the
            source table. This argument can be used to narrow the set of indexes to write, or to be explicit about the
            expected set of indexes present on all sources. Indexes that are specified but missing will be computed on
            demand.

    Raises:
        DHError
    """
    if col_definitions is not None:
        warn("col_definitions is deprecated and will be removed in a future release. Use table_definition "
             "instead.", DeprecationWarning, stacklevel=2)
    try:
        write_instructions = _build_parquet_instructions(
            col_instructions=col_instructions,
            compression_codec_name=compression_codec_name,
            max_dictionary_keys=max_dictionary_keys,
            max_dictionary_size=max_dictionary_size,
            target_page_size=target_page_size,
            for_read=False,
            generate_metadata_files=generate_metadata_files,
            base_name=base_name,
            table_definition=table_definition,
            col_definitions=col_definitions,
            index_columns=index_columns,
        )
        _JParquetTools.writeKeyValuePartitionedTable(table.j_object, destination_dir, write_instructions)
    except Exception as e:
        raise DHError(e, "failed to write to parquet data.") from e




[docs]
def batch_write(
    tables: List[Table],
    paths: List[str],
    table_definition: Optional[Union[Dict[str, DType], List[Column]]] = None,
    col_definitions: Optional[List[Column]] = None,
    col_instructions: Optional[List[ColumnInstruction]] = None,
    compression_codec_name: Optional[str] = None,
    max_dictionary_keys: Optional[int] = None,
    max_dictionary_size: Optional[int] = None,
    target_page_size: Optional[int] = None,
    generate_metadata_files: Optional[bool] = None,
    index_columns: Optional[Sequence[Sequence[str]]] = None
):
    """ Writes tables to disk in parquet format to a supplied set of paths.

    Note that either all the tables are written out successfully or none is.

    Args:
        tables (List[Table]): the source tables
        paths (List[str]): the destination paths. Any non-existing directories in the paths provided are
            created. If there is an error, any intermediate directories previously created are removed; note this makes
            this method unsafe for concurrent use
        table_definition (Optional[Union[Dict[str, DType], List[Column]]): the table definition to use for writing,
            instead of the definitions implied by the table. Default is None, which means use the column definitions
            implied by the table. This definition can be used to skip some columns or add additional columns with
            null values. Both table_definition and col_definitions cannot be specified at the same time.
        col_definitions (List[Column]): the column definitions to use for writing. This argument is deprecated and will
            be removed in a future release. Use table_definition instead.
        col_instructions (Optional[List[ColumnInstruction]]): instructions for customizations while writing
        compression_codec_name (Optional[str]): the compression codec to use. Allowed values include "UNCOMPRESSED",
            "SNAPPY", "GZIP", "LZO", "LZ4", "LZ4_RAW", "ZSTD", etc. If not specified, defaults to "SNAPPY".
        max_dictionary_keys (Optional[int]): the maximum number of unique keys the writer should add to a dictionary page
            before switching to non-dictionary encoding; never evaluated for non-String columns , defaults to 2^20 (1,048,576)
        max_dictionary_size (Optional[int]): the maximum number of bytes the writer should add to the dictionary before
            switching to non-dictionary encoding, never evaluated for non-String columns, defaults to 2^20 (1,048,576)
        target_page_size (Optional[int]): the target page size in bytes, if not specified, defaults to 2^20 bytes (1 MiB)
        generate_metadata_files (Optional[bool]): whether to generate parquet _metadata and _common_metadata files,
            defaults to False. Generating these files can help speed up reading of partitioned parquet data because these
            files contain metadata (including schema) about the entire dataset, which can be used to skip reading some
            files.
        index_columns (Optional[Sequence[Sequence[str]]]): sequence of sequence containing the column names for indexes
            to persist. The write operation will store the index info for the provided columns as sidecar tables. For
            example, if the input is [["Col1"], ["Col1", "Col2"]], the write operation will store the index info for
            ["Col1"] and for ["Col1", "Col2"]. By default, data indexes to write are determined by those present on the
            source table. This argument can be used to narrow the set of indexes to write, or to be explicit about the
            expected set of indexes present on all sources. Indexes that are specified but missing will be computed on
            demand.

    Raises:
        DHError
    """
    if col_definitions is not None:
        warn("col_definitions is deprecated and will be removed in a future release. Use table_definition "
             "instead.", DeprecationWarning, stacklevel=2)
        #TODO(deephaven-core#5362): Remove col_definitions parameter
    elif table_definition is None:
        raise ValueError("Either table_definition or col_definitions must be specified.")
    try:
        write_instructions = _build_parquet_instructions(
            col_instructions=col_instructions,
            compression_codec_name=compression_codec_name,
            max_dictionary_keys=max_dictionary_keys,
            max_dictionary_size=max_dictionary_size,
            target_page_size=target_page_size,
            for_read=False,
            generate_metadata_files=generate_metadata_files,
            table_definition=table_definition,
            col_definitions=col_definitions,
            index_columns=index_columns,
        )
        _JParquetTools.writeTables([t.j_table for t in tables], _j_string_array(paths), write_instructions)
    except Exception as e:
        raise DHError(e, "write multiple tables to parquet data failed.") from e