Skip to content

Database unique summary

get_unique_single_and_multi_values(df)

Get unique values for single-label and multi-label columns in a dataframe.

Parameters:

| Name | Type        | Description     | Default  |
| ---- | ----------- | --------------- | -------- |
| `df` | `DataFrame` | input dataframe | required |

Returns: A tuple of two dictionaries:

- a dict mapping single-label column names to their unique values
- a dict mapping multi-label column names to their unique values

Source code in src/kibad_llm/data_integration/database_unique_summary.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def get_unique_single_and_multi_values(
    df: pd.DataFrame,
) -> tuple[dict[str, list], dict[str, list]]:
    """
    Collect the unique values of every single-label and multi-label column.

    Columns that contain only null values are logged and excluded from both
    result dictionaries.

    Args:
        df: input dataframe
    Returns:
        A tuple of two dictionaries:
        - a dict mapping single-label column names to their unique values
        - a dict mapping multi-label column names to their unique values
    """

    all_null_cols = [col for col in df.columns if df[col].isnull().all()]
    if all_null_cols:
        logger.warning(f"Columns with all None values found:\n{all_null_cols}")

    list_cols = _get_list_cols(df)
    logger.info(f"Found {len(list_cols)} multi-label columns:\n{list_cols}")

    # every column that is neither all-null nor list-valued is single-label
    excluded = set(all_null_cols) | set(list_cols)
    scalar_cols = [col for col in df.columns if col not in excluded]
    logger.info(f"Found {len(scalar_cols)} single-label columns:\n{scalar_cols}")

    singles: dict[str, list] = {}
    for col in scalar_cols:
        singles[col] = _sort_with_none(df[col].unique().tolist())

    # multi-label columns are exploded first; None entries are dropped
    multis: dict[str, list] = {}
    for col in list_cols:
        multis[col] = _sort_with_none(df[col].explode().unique().tolist(), remove_none=True)

    return singles, multis

show_unique_values_summary(input_file, top_n=20, fields=None, key_sep='.')

Show a summary of unique values in a JSONL file with nested entries.

Source code in src/kibad_llm/data_integration/database_unique_summary.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def show_unique_values_summary(
    input_file: str, top_n: int = 20, fields: list[str] | None = None, key_sep: str = "."
) -> None:
    """
    Show a summary of unique values in a JSONL file with nested entries.

    Args:
        input_file: path to a JSONL file; nested entries are flattened
        top_n: only keys with fewer than this many unique values are printed
            (ignored when ``fields`` is given)
        fields: optional explicit list of flattened keys to print
        key_sep: separator used when flattening nested keys
    """

    raw = pd.read_json(input_file, lines=True)
    # flatten each nested record into dotted (key_sep-joined) columns
    flat = raw.apply(
        lambda row: pd.Series(flatten_dict_simple(d=row.to_dict(), sep=key_sep)), axis="columns"
    )

    singles, multis = get_unique_single_and_multi_values(df=flat)
    combined = singles | multis

    # report the number of unique entries per key
    counts = {key: len(values) for key, values in combined.items()}
    logger.info(
        f"number of unique entries per key:\n{json.dumps(counts, indent=2, sort_keys=True)}\n"
    )

    if fields is None:
        # keep only keys whose value list is short enough to display
        shown = {k: combined[k] for k in sorted(combined) if len(combined[k]) < top_n}
        leftover = len(counts) - len(shown)
        msg = (
            f"show unique entries for {len(shown)} keys with less than "
            f"{top_n} entries (#remaining keys: {leftover})"
        )
    else:
        shown = {k: combined[k] for k in sorted(combined) if k in fields}
        msg = f"show unique entries for specified fields ({fields})"

    # order keys by how many unique entries they have, for readability
    ordered = dict(sorted(shown.items(), key=lambda kv: len(kv[1])))
    lines = json.dumps(ordered, indent=2, sort_keys=True, ensure_ascii=False)
    logger.info(f"{msg}:\n{lines}")