Source code for split_utils

import numpy as np
from scipy import stats


def subsample_dist(values, sub_dist, n_samples, seed, return_kde_fit=False):
    """Draw indices of `values` so that the picks follow a target distribution

    A Gaussian KDE (Scott bandwidth) is fit on `values`. Each element is then
    weighted by the target PDF divided by the KDE density, both evaluated at
    that element, and indices are drawn without replacement using the
    normalized weights.

    Parameters
    ----------
    values : np.ndarray
        Values to subsample from
    sub_dist : scipy.stats._continuous_distns object
        Target subsampling distribution. Must expose a `pdf` method
    n_samples : int
        Number of indices to draw
    seed : int
        Seed passed to `np.random.default_rng`
    return_kde_fit : bool
        Whether to also return the KDE fit on `values`

    Returns
    -------
    np.ndarray or tuple
        Indices of the subsamples within `values`. When `return_kde_fit`
        is True, a tuple of (indices, fitted KDE) is returned instead.

    """
    density_fit = stats.gaussian_kde(values, bw_method='scott')
    generator = np.random.default_rng(seed)
    # Importance weight: target density relative to the empirical density
    weights = sub_dist.pdf(values)/density_fit.pdf(values)
    probs = weights/weights.sum()
    candidates = np.arange(len(values))
    picked = generator.choice(candidates, n_samples, p=probs, replace=False)
    if not return_kde_fit:
        return picked
    return picked, density_fit
def random_split(indices, frac_val, seed):
    """Randomly split a collection of indices into training and validation

    Parameters
    ----------
    indices : np.ndarray
        List of indices to split
    frac_val : float
        Fraction of validation points, between 0 and 1. The number of
        validation points is ``int(len(indices)*frac_val)`` (rounded down).
    seed : int
        Seed passed to `np.random.default_rng`

    Returns
    -------
    tuple
        Training indices (list, in their original order), validation
        indices (np.ndarray)

    Raises
    ------
    ValueError
        If `frac_val` is not within [0, 1]

    """
    if not 0 <= frac_val <= 1:
        raise ValueError(
            f"frac_val must be between 0 and 1, got {frac_val}")
    indices = np.asarray(indices)
    n_val = int(len(indices)*frac_val)
    rng = np.random.default_rng(seed)
    # Sample WITHOUT replacement: the default (replace=True) can pick the
    # same index several times, which shrinks the effective validation set
    # and makes the two parts not add up to the input.
    val_i = rng.choice(indices, size=n_val, replace=False)
    held_out = set(val_i.tolist())
    # Keep the input order so the result is reproducible for a given seed
    # (a set difference yields an arbitrary order).
    train_i = [i for i in indices.tolist() if i not in held_out]
    return train_i, val_i