Skip to content

Json

read_and_preprocess(file, id_key=None, encoding='utf-8', process_id=None, preprocess=None, archive_member=None, **kwargs)

Read a dataset from a JSON (lines) file (optionally compressed) and preprocess it.

If id_key is provided, the JSON file is assumed to be in JSON lines format, and the value corresponding to id_key is used as the unique identifier for each entry. If id_key is None, the entire JSON file is loaded as a single object.

Compression inference follows pandas' documented behavior ('.gz', '.bz2', '.zip', '.xz', '.zst', and tar variants like '.tar.gz').

Parameters:

Name Type Description Default
file str | Path

Path to the JSON or JSON lines file.

required
id_key str | None

Key in the JSON objects to use as the unique identifier.

None
encoding str

File encoding to use when reading the file.

'utf-8'
process_id Callable | None

Optional function to process IDs.

None
preprocess Callable | None

Optional preprocessing function to apply to the dataset.

None
archive_member str | None

If the file is an archive, the member to extract (required for tar/zip with multiple files).

None
**kwargs

Additional keyword arguments to pass to json.load or json.loads.

{}

Returns: The loaded and preprocessed dataset — a dict mapping each (optionally processed) identifier to its entry when id_key is given, or the raw loaded JSON object otherwise.

Source code in src/kibad_llm/dataset/json.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def read_and_preprocess(
    file: str | Path,
    id_key: str | None = None,
    encoding: str = "utf-8",
    process_id: Callable | None = None,
    preprocess: Callable | None = None,
    archive_member: str | None = None,
    **kwargs,
) -> dict[Hashable, dict]:
    """Read a dataset from a JSON (lines) file (optionally compressed) and preprocess it.

    When `id_key` is given, the input is treated as JSON lines: each non-blank
    line is parsed as one object, the value stored under `id_key` becomes that
    entry's key (after `process_id`, if supplied), and the key field itself is
    removed from the entry. When `id_key` is None, the whole file is parsed as
    one JSON document and returned as-is (before optional preprocessing).

    Compression inference follows pandas' documented behavior ('.gz', '.bz2', '.zip', '.xz',
    '.zst', and tar variants like '.tar.gz').

    Args:
        file: Path to the JSON or JSON lines file.
        id_key: Key in the JSON objects to use as the unique identifier.
        encoding: File encoding to use when reading the file.
        process_id: Optional function to process IDs.
        preprocess: Optional preprocessing function to apply to the dataset.
        archive_member: If the file is an archive, the member to extract (required for tar/zip
            with multiple files).
        **kwargs: Additional keyword arguments to pass to `json.load` or `json.loads`.

    Returns:
        The loaded and preprocessed Dataset.
    """
    # Default identity mapping when no ID transformation was requested.
    if process_id is None:
        process_id = lambda value: value

    logger.info(f"Loading dataset from JSON file: {file} ...")

    with open_text(file, encoding=encoding, archive_member=archive_member) as f:
        if id_key is None:
            # Whole file is one JSON document.
            dataset = json.load(f, **kwargs)
        else:
            # JSON lines: one object per non-blank line; the id field is
            # popped out of the entry and used (after process_id) as its key.
            dataset = {}
            for raw_line in f:
                if not raw_line.strip():
                    continue
                entry = json.loads(raw_line, **kwargs)
                dataset[process_id(entry.pop(id_key))] = entry

    if preprocess is not None:
        logger.info(f"Apply preprocessing function to dataset: {preprocess} ...")
        dataset = {identifier: preprocess(entry) for identifier, entry in dataset.items()}

    return dataset