Skip to content

Json

read_and_preprocess(file, id_key=None, encoding='utf-8', process_id=None, preprocess=None, archive_member=None, **kwargs)

Read a dataset from a JSON (lines) file (optionally compressed) and preprocess it.

If id_key is provided, the JSON file is assumed to be in JSON lines format, and the value corresponding to id_key is used as the unique identifier for each entry. If id_key is None, the entire JSON file is loaded as a single object.

Compression inference follows pandas' documented behavior ('.gz', '.bz2', '.zip', '.xz', '.zst', and tar variants like '.tar.gz').

Parameters:

Name Type Description Default
file str | Path

Path to the JSON or JSON lines file.

required
id_key str | None

Key in the JSON objects to use as the unique identifier.

None
encoding str

File encoding to use when reading the file.

'utf-8'
process_id Callable | None

Optional function to process IDs.

None
preprocess Callable | None

Optional preprocessing function to apply to the dataset.

None
archive_member str | None

If the file is an archive, the member to extract (required for tar/zip with multiple files).

None
**kwargs

Additional keyword arguments to pass to json.load or json.loads.

{}

Returns: The loaded and preprocessed dataset — a dict mapping each (optionally processed) identifier to its entry when id_key is given, or the raw loaded JSON object otherwise.

Source code in src/kibad_llm/dataset/json.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def read_and_preprocess(
    file: str | Path,
    id_key: str | None = None,
    encoding: str = "utf-8",
    process_id: Callable | None = None,
    preprocess: Callable | None = None,
    archive_member: str | None = None,
    **kwargs,
) -> dict[Hashable, dict]:
    """Read a dataset from a JSON (lines) file (optionally compressed) and preprocess it.

    When `id_key` is given, the input is treated as JSON lines: each non-blank
    line is parsed as one object, the value stored under `id_key` becomes that
    entry's key (after `process_id`, if supplied), and the key field itself is
    removed from the entry. When `id_key` is None, the whole file is parsed as
    one JSON document and returned as-is (before optional preprocessing).

    Compression inference follows pandas' documented behavior ('.gz', '.bz2', '.zip', '.xz',
    '.zst', and tar variants like '.tar.gz').

    Args:
        file: Path to the JSON or JSON lines file.
        id_key: Key in the JSON objects to use as the unique identifier.
        encoding: File encoding to use when reading the file.
        process_id: Optional function to process IDs.
        preprocess: Optional preprocessing function to apply to the dataset.
        archive_member: If the file is an archive, the member to extract (required for tar/zip
            with multiple files).
        **kwargs: Additional keyword arguments to pass to `json.load` or `json.loads`.

    Returns:
        The loaded and preprocessed Dataset.
    """
    # Default identity mapping when no ID transformation was requested.
    if process_id is None:
        process_id = lambda value: value

    logger.info(f"Loading dataset from JSON file: {file} ...")

    with open_text(file, encoding=encoding, archive_member=archive_member) as f:
        if id_key is None:
            # Whole file is one JSON document.
            dataset = json.load(f, **kwargs)
        else:
            # JSON lines: one object per non-blank line; the id field is
            # popped out of the entry and used (after process_id) as its key.
            dataset = {}
            for raw_line in f:
                if not raw_line.strip():
                    continue
                entry = json.loads(raw_line, **kwargs)
                dataset[process_id(entry.pop(id_key))] = entry

    if preprocess is not None:
        logger.info(f"Apply preprocessing function to dataset: {preprocess} ...")
        dataset = {identifier: preprocess(entry) for identifier, entry in dataset.items()}

    return dataset