Source code for mlvern.data.fingerprint
import pandas as pd
from mlvern.utils.hashing import hash_object
[docs]
def fingerprint_dataset(df: pd.DataFrame, target: str):
    """
    Build a lightweight fingerprint for a dataset (intended to run on every call).

    The fingerprint combines a schema description (column labels, dtype
    names and the target name) with the full column-wise contents of the
    frame, and passes both to ``hash_object``.

    Args:
        df: The dataset to fingerprint.
        target: Name of the target column. It is recorded in the schema
            as given; it is not checked against ``df.columns`` here.

    Returns:
        A dict with the keys:
            ``dataset_hash`` - first 12 characters of the value returned by
                ``hash_object`` (presumably a hex digest string - confirm
                against ``mlvern.utils.hashing``),
            ``rows`` - number of rows in ``df``,
            ``columns`` - number of columns in ``df``,
            ``schema`` - the schema dict that was hashed.
    """
    column_labels = list(df.columns)
    dtype_names = df.dtypes.astype(str)

    # Key order is kept stable in case hash_object is sensitive to it.
    schema = {
        "columns": column_labels,
        "dtypes": dtype_names.to_dict(),
        "target": target,
    }

    # Hash the data column by column together with the schema, so a change
    # to either yields a different digest.
    payload = {
        "data": df.to_dict(orient="list"),
        "schema": schema,
    }
    digest = hash_object(payload)

    n_rows, n_cols = df.shape
    short_hash = digest[:12]  # short hash for paths

    return {
        "dataset_hash": short_hash,
        "rows": n_rows,
        "columns": n_cols,
        "schema": schema,
    }