# Source code for PAsampling.utils.data_selection

import numpy as np
import h5py
from tqdm import tqdm
import random
from scipy.spatial import cKDTree
from ..wrappers.kmedoids_sampler import Kmedoids
from ..wrappers.facility_location_sampler import FacilityLocation
from ..wrappers.twin_sampler import Twin
from ..wrappers.fps_sampler import FPS
from ..wrappers.dafps_sampler import DAFPS
from ..wrappers.fps_plus_sampler import FPS_plus
np.random.seed(123)

def save_indices_to_h5py(group_train, group_test, trainig_set_sizes, indices, idx_test):
    """
    Write nested train/test index splits, one pair of datasets per size.

    For every size ``n`` the training selection is the first ``n`` entries
    of ``indices`` and the test selection is every entry of ``idx_test``
    that does not appear in that prefix. Both are stored under the dataset
    name ``selected_{n}``.

    Parameters:
    -----------
    group_train : object exposing ``create_dataset(name, data=...)``
        Destination for the training indices.
    group_test : object exposing ``create_dataset(name, data=...)``
        Destination for the complementary test indices.
    trainig_set_sizes : iterable of int
        Prefix lengths to save.
    indices : sliceable sequence
        Selected indices; only prefixes ``indices[:n]`` are used.
    idx_test : iterable of hashable
        Candidate pool from which the training prefix is removed.

    Returns:
    --------
    None
    """
    for size in trainig_set_sizes:
        dataset_name = f'selected_{size}'
        chosen = indices[:size]
        # Set difference: the stored test indices follow set iteration
        # order, not the order of ``idx_test``.
        held_out = set(idx_test).difference(set(chosen))
        group_train.create_dataset(dataset_name, data=chosen)
        group_test.create_dataset(dataset_name, data=list(held_out))


def _open_run_groups(strategy_group, initial_conditions):
    """Create and yield the train/test sub-groups for each initial condition.

    Yields ``(count, initial_sub, subgroup_train, subgroup_test)`` where
    ``count`` starts at 1 and the sub-groups are named
    ``train_Initialize_{count}`` / ``test_Initialize_{count}``.
    """
    # tqdm wraps the inner iterable (not enumerate) so it can read its length.
    for count, initial_sub in enumerate(tqdm(initial_conditions), 1):
        subgroup_train = strategy_group.create_group(f'train_Initialize_{count}')
        subgroup_test = strategy_group.create_group(f'test_Initialize_{count}')
        yield count, initial_sub, subgroup_train, subgroup_test


def _save_single_split(group_train, group_test, n, idx_train, all_indices):
    """Store one training selection and its complement under ``selected_{n}``."""
    # The complement is always taken against the full index pool, so the
    # test set of one size never depends on the selection made for another.
    idx_test_selected = list(set(all_indices).difference(idx_train))
    group_train.create_dataset(f'selected_{n}', data=idx_train)
    group_test.create_dataset(f'selected_{n}', data=idx_test_selected)


def DataSelector(X, save_path, strategies=None, trainig_set_sizes=None, initial_conditions=None, knn=100, mu=3, ratios=None, gamma_FacLocG=1):
    """
    Selects data subsets based on various strategies and saves the indices to an HDF5 file.

    The file is organised as
    ``<strategy>/train_Initialize_<k>/selected_<n>`` and
    ``<strategy>/test_Initialize_<k>/selected_<n>``, where ``k`` counts the
    initial conditions from 1 and ``n`` is a training set size. The test
    dataset holds every row index of ``X`` that is not in the matching
    training dataset.

    Parameters:
    -----------
    X : numpy.ndarray
        The input data array (n_points, n_features).
    save_path : str
        The path where the HDF5 file will be saved. The file is opened in
        ``"w"`` mode, so an existing file is overwritten.
    strategies : list of str, optional
        List of strategies to use for data selection. Possible values include
        'DAFPS', 'FPS', 'RDM', 'k-medoids++', 'FacilityLocation', 'Twinning',
        'FPS-k-medoids++', 'FPS-FacLoc', 'FPS-RDM', 'FacLoc-G'.
        ``None`` selects nothing (a file without groups is written). A single
        string is treated as a one-element list.
    trainig_set_sizes : list of int, optional
        List of training set sizes to be used for each strategy.
    initial_conditions : list, optional
        List of initial conditions for the data selection strategies. One
        train/test sub-group pair is written per entry. For 'RDM' and
        'k-medoids++' only the position of the entry is used (as a seed).
    knn : int, optional
        Number of nearest neighbors to consider for the DAFPS strategy.
        Default is 100.
    mu : int, optional
        Hyperparameter for the DAFPS and FPS-(method) strategies. Default is 3.
        The 'FPS-k-medoids++', 'FPS-FacLoc' and 'FPS-RDM' strategies are
        skipped when ``mu`` is 0.
    ratios : list of float, optional
        List of ratios for the Twinning strategy, paired element-wise with
        ``trainig_set_sizes``. Must be provided when 'Twinning' is requested.
    gamma_FacLocG : float, optional
        Gamma parameter for the FacilityLocation strategy with Gaussian metric.
        Default is 1.

    Returns:
    --------
    None
    """
    if strategies is None:
        strategies = []
    elif isinstance(strategies, str):
        # A bare string would make ``in`` a substring test
        # (e.g. 'FPS' in 'DAFPS'), silently enabling unrelated strategies.
        strategies = [strategies]

    # Full pool of row indices; every test split is a complement against it.
    all_indices = list(np.arange(X.shape[0]))

    # The context manager closes the file even if a sampler raises.
    with h5py.File(save_path, "w") as f:
        if 'DAFPS' in strategies:
            print('DAFPS ')
            grp = f.create_group('DAFPS')
            dafps_sampler = DAFPS(X, knn=knn)
            for _, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                dafps_indices = dafps_sampler.fit(
                    initial_subset=[initial_sub],
                    b_samples=max(trainig_set_sizes),
                    mu=mu,
                )
                save_indices_to_h5py(subgroup_train, subgroup_test, trainig_set_sizes, dafps_indices, all_indices)

        if 'FPS' in strategies:
            print('FPS')
            grp = f.create_group('FPS')
            for _, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                fps_sampler = FPS(precomputed_distances=False)
                fps_indices = fps_sampler.fit(
                    X,
                    initial_subset=[initial_sub],
                    b_samples=max(trainig_set_sizes),
                )
                save_indices_to_h5py(subgroup_train, subgroup_test, trainig_set_sizes, fps_indices, all_indices)

        if 'RDM' in strategies:
            print('RDM')
            grp = f.create_group('RDM')
            for count, _, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                # Seed with the 0-based run index; note this reseeds the
                # module-level ``random`` generator.
                random.seed(count - 1)
                random_indices = random.sample(all_indices, max(trainig_set_sizes))
                save_indices_to_h5py(subgroup_train, subgroup_test, trainig_set_sizes, random_indices, all_indices)

        if 'k-medoids++' in strategies:
            print('k-medoids++')
            grp = f.create_group('k-medoids++')
            for count, _, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                # One independent fit per size, seeded by the run number.
                for n in trainig_set_sizes:
                    kmedoids_sampler = Kmedoids(b_samples=n, init='k-medoids++', metric='euclidean', random_state=count)
                    idx_train = kmedoids_sampler.fit(X)
                    _save_single_split(subgroup_train, subgroup_test, n, idx_train, all_indices)

        if 'FacilityLocation' in strategies:
            print('FacilityLocation')
            grp = f.create_group('FacilityLocation')
            for _, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                facility_location_sampler = FacilityLocation(
                    initial_subset=[initial_sub],
                    b_samples=max(trainig_set_sizes),
                    metric='euclidean',
                    verbose=False,
                    n_jobs=-1,
                )
                facility_location_indices = facility_location_sampler.fit(X)
                save_indices_to_h5py(subgroup_train, subgroup_test, trainig_set_sizes, facility_location_indices, all_indices)

        if 'Twinning' in strategies:
            print('Twinning')
            grp = f.create_group('Twinning')
            for _, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                for n, ratio in zip(trainig_set_sizes, ratios):
                    twin_sampler = Twin(ratio=ratio, idx_initial_point=initial_sub)
                    idx_train = twin_sampler.fit(X)
                    # Complement against the full pool: previously the pool
                    # was overwritten here and shrank from one size to the next.
                    _save_single_split(subgroup_train, subgroup_test, n, idx_train, all_indices)

        if 'FPS-k-medoids++' in strategies and mu != 0:
            print('FPS-k-medoids++')
            grp = f.create_group('FPS-k-medoids++')
            for count, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                for n in trainig_set_sizes:
                    modified_sampler = FPS_plus(method='kmedoids', mu=mu)
                    idx_train = modified_sampler.fit(
                        X,
                        initial_subset=[initial_sub],
                        b_samples=n,
                        random_state=count,
                    )
                    # Same fix as for 'Twinning': do not shrink the pool
                    # across sizes.
                    _save_single_split(subgroup_train, subgroup_test, n, idx_train, all_indices)

        if 'FPS-FacLoc' in strategies and mu != 0:
            print('FPS-FacLoc')
            grp = f.create_group('FPS-FacLoc')
            for _, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                modified_sampler = FPS_plus(method='facility_location', mu=mu)
                modified_indices = modified_sampler.fit(
                    X,
                    initial_subset=[initial_sub],
                    b_samples=max(trainig_set_sizes),
                )
                save_indices_to_h5py(subgroup_train, subgroup_test, trainig_set_sizes, modified_indices, all_indices)

        if 'FPS-RDM' in strategies and mu != 0:
            print('FPS-random splits')
            grp = f.create_group('FPS-RDM')
            for count, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                modified_sampler = FPS_plus(method='random', mu=mu)
                modified_indices = modified_sampler.fit(
                    X,
                    initial_subset=[initial_sub],
                    b_samples=max(trainig_set_sizes),
                    random_state=count - 1,
                )
                save_indices_to_h5py(subgroup_train, subgroup_test, trainig_set_sizes, modified_indices, all_indices)

        if 'FacLoc-G' in strategies:
            print('FacLoc-G')
            grp = f.create_group('FacLoc-G')
            for _, initial_sub, subgroup_train, subgroup_test in _open_run_groups(grp, initial_conditions):
                facility_location_sampler = FacilityLocation(
                    initial_subset=[initial_sub],
                    b_samples=max(trainig_set_sizes),
                    metric='Gaussian',
                    verbose=False,
                    n_jobs=-1,
                )
                facility_location_indices = facility_location_sampler.fit(X, gamma=gamma_FacLocG)
                save_indices_to_h5py(subgroup_train, subgroup_test, trainig_set_sizes, facility_location_indices, all_indices)