diff --git a/QuadratiK/datasets/_dataset.py b/QuadratiK/datasets/_dataset.py index 016f703..3bf5f1e 100644 --- a/QuadratiK/datasets/_dataset.py +++ b/QuadratiK/datasets/_dataset.py @@ -42,7 +42,7 @@ def load_wireless_data(desc=False, return_X_y=False, as_dataframe=True, scaled=F (n_samples,) containing the target samples. data : pandas.DataFrame, if as_dataframe is True - Dataframe of the data with shape (n_samples, n_features + class) + Dataframe of the data with shape (n_samples, n_features + class). (desc, data, target) : tuple, if desc is True and return_X_y is True A tuple of description and two numpy.ndarray. The first containing a 2D @@ -60,7 +60,7 @@ def load_wireless_data(desc=False, return_X_y=False, as_dataframe=True, scaled=F User Localization in an Indoor Environment Using Fuzzy Hybrid of Particle Swarm Optimization & Gravitational Search Algorithm with Neural Networks. In: Deep, K., et al. Proceedings of Sixth International Conference on Soft Computing for Problem Solving. Advances in Intelligent - Systems and Computing, vol 546. Springer, Singapore. https://doi.org/10.1007/978-981-10-3322-3_27 + Systems and Computing, vol 546. Springer, Singapore. https://doi.org/10.1007/978-981-10-3322-3_27. Source ------- @@ -115,7 +115,7 @@ def load_wisconsin_breast_cancer_data( desc=False, return_X_y=False, as_dataframe=True, scaled=False ): """ - The wisconsin breast cancer dataset data frame has 569 rows and 31 columns. The first 30 variables + The Wisconsin breast cancer dataset data frame has 569 rows and 31 columns. The first 30 variables report the features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. The last column indicates the class labels (Benign = 0 or Malignant = 1). @@ -153,7 +153,7 @@ def load_wisconsin_breast_cancer_data( (n_samples,) containing the target samples. 
data : pandas.DataFrame, if as_dataframe is True - Dataframe of the data with shape (n_samples, n_features + class) + Dataframe of the data with shape (n_samples, n_features + class). (desc, data, target) : tuple, if desc is True and return_X_y is True A tuple of description and two numpy.ndarray. The first containing a 2D @@ -293,7 +293,7 @@ def load_wine_data(desc=False, return_X_y=False, as_dataframe=True, scaled=False (n_samples,) containing the target samples. data : pandas.DataFrame, if as_dataframe is True - Dataframe of the data with shape (n_samples, n_features + class) + Dataframe of the data with shape (n_samples, n_features + class). (desc, data, target) : tuple, if desc is True and return_X_y is True A tuple of description and two numpy.ndarray. The first containing a 2D diff --git a/QuadratiK/kernel_test/_cv_functions.py b/QuadratiK/kernel_test/_cv_functions.py index 4592ad7..0ccd163 100644 --- a/QuadratiK/kernel_test/_cv_functions.py +++ b/QuadratiK/kernel_test/_cv_functions.py @@ -40,33 +40,33 @@ def cv_twosample( The quantile to use for critical value estimation. data_pool: numpy.ndarray - ndarray containing the data to be used in the test + ndarray containing the data to be used in the test. size_x : int - The number of rows in the data_pool corresponding to group X + The number of rows in the data_pool corresponding to group X. size_y : int - The number of rows in the data_pool corresponding to group Y + The number of rows in the data_pool corresponding to group Y. h : float - The tuning parameter for the kernel test + The tuning parameter for the kernel test. method : str Method to use for computing the critical value - (one of bootstrap, permutation, or subsampling) + (one of bootstrap, permutation, or subsampling). b : float, optional - Subsampling block size (only used if method is subsampling) + Subsampling block size (only used if method is subsampling). random_state : int, None, optional. - Seed for random number generation. 
Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional n_jobs specifies the maximum number of concurrently running workers. If 1 is given, no joblib parallelism is used at all, which is useful for debugging. For more information on joblib n_jobs refer to - - https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html + https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html. Returns ------- @@ -76,7 +76,7 @@ def cv_twosample( References ----------- Markatou Marianthi, Saraceno Giovanni, Chen Yang (2023). “Two- and k-Sample Tests Based on - Quadratic Distances.” Manuscript, (Department of Biostatistics, University at Buffalo) + Quadratic Distances.” Manuscript, (Department of Biostatistics, University at Buffalo). """ if method == "bootstrap": @@ -147,14 +147,14 @@ def cv_normality( The quantile of the distribution used to select the critical value. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional n_jobs specifies the maximum number of concurrently running workers. If 1 is given, no joblib parallelism is used at all, which is useful for debugging. For more information on joblib n_jobs refer to - - https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html + https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html. Returns ------- @@ -180,7 +180,7 @@ def cv_ksample( n_jobs=8, ): """ - Compute the critical value for k-sample kernel tests + Compute the critical value for k-sample kernel tests. Parameters -------------- @@ -208,14 +208,14 @@ def cv_ksample( (one of "bootstrap", "permutation" or "subsampling"). random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional n_jobs specifies the maximum number of concurrently running workers. 
If 1 is given, no joblib parallelism is used at all, which is useful for debugging. For more information on joblib n_jobs refer to - https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html + https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html. Returns --------- @@ -226,7 +226,7 @@ def cv_ksample( References ----------- Markatou Marianthi, Saraceno Giovanni, Chen Yang (2023). “Two- and k-Sample Tests Based on - Quadratic Distances.” Manuscript, (Department of Biostatistics, University at Buffalo) + Quadratic Distances.” Manuscript, (Department of Biostatistics, University at Buffalo). """ sizes = np.unique(y, return_counts=True)[1] n = len(y) diff --git a/QuadratiK/kernel_test/_h_selection.py b/QuadratiK/kernel_test/_h_selection.py index d1aee67..1a9111c 100644 --- a/QuadratiK/kernel_test/_h_selection.py +++ b/QuadratiK/kernel_test/_h_selection.py @@ -35,7 +35,7 @@ def _objective_one_sample( ---------- alternative : str family of alternative chosen for selecting h, - must be one of "mean", "variance" and "skewness" + must be one of "location", "scale" and "skewness". delta : numpy.ndarray Array of parameter values indicating chosen alternatives. @@ -49,10 +49,10 @@ def _objective_one_sample( mean_dat : numpy.ndarray Means of the multivariate distribution to be used - for determining the best h + for determining the best h. n : int - Number of observations in set of samples + Number of observations in set of samples. num_iter : int The number of iterations to use for critical value estimation. @@ -61,30 +61,30 @@ def _objective_one_sample( Quantile to use for critical value estimation. rep_values : int - Number of the bootstrap replication + Number of the bootstrap replication. s_dat : numpy.ndarray Variances of the multivariate distribution to be used - for determining the best h + for determining the best h. 
skew_data : numpy.ndarray Skewness of the multivariate distribution to be used - for determining the best h + for determining the best h. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional n_jobs specifies the maximum number of concurrently running workers. If 1 is given, no joblib parallelism is used at all, which is useful for debugging. For more information on joblib n_jobs refer to - https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html + https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html. Returns ------- List containing rep_values, delta, h and boolean - representing the rejection/acceptance of null hypothesis + representing the rejection/acceptance of null hypothesis. """ dk = delta_dim * delta if alternative == "location": @@ -147,13 +147,13 @@ def _objective_two_sample( ): """ Objective function using using the best - h is chosen for two sample test + h is chosen for two sample test. Parameters ---------- alternative : str family of alternative chosen for selecting h, - must be one of "mean", "variance" and "skewness" + must be one of "location", "scale" and "skewness". b : float The size of the subsamples used in the subsampling algorithm. @@ -169,57 +169,57 @@ def _objective_two_sample( Bandwidth for the kernel function. m : int - Number of observations in second set of samples + Number of observations in second set of samples. mean_dat : numpy.ndarray Means of the multivariate distribution to be used - for determining the best h + for determining the best h. method : str the method used for critical value estimation, must be one of "subsampling", "bootstrap", or "permutation". n : int - Number of observations in first set of samples + Number of observations in first set of samples. num_iter : int The number of iterations to use for critical value estimation. 
pooled : numpy.ndarray Observations in first set and second - set of samples combined together retaining the number of columns + set of samples combined together retaining the number of columns. quantile : float Quantile to use for critical value estimation. rep_values : int - Number of the bootstrap replication + Number of the bootstrap replication. s_dat : numpy.ndarray Variances of the multivariate distribution to be used - for determining the best h + for determining the best h. skew_data : numpy.ndarray Skewness of the multivariate distribution to be used - for determining the best h + for determining the best h. d : int - Dimension of the data + Dimension of the data. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional n_jobs specifies the maximum number of concurrently running workers. If 1 is given, no joblib parallelism is used at all, which is useful for debugging. For more information on joblib n_jobs refer to - https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html + https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html. Returns ------- List containing rep_values, delta, h and boolean - representing the rejection/acceptance of null hypothesis + representing the rejection/acceptance of null hypothesis. """ dk = delta_dim * delta @@ -291,13 +291,13 @@ def _objective_k_sample( ): """ Objective function using using the best - h is chosen for k-sample test + h is chosen for k-sample test. Parameters ---------- alternative : str family of alternative chosen for selecting h, - must be one of "mean", "variance" and "skewness" + must be one of "location", "scale" and "skewness". num_iter : int The number of iterations to use for critical value estimation. @@ -316,11 +316,11 @@ def _objective_k_sample( Bandwidth for the kernel function. 
k : int - Number of classes (or groups) in the data + Number of classes (or groups) in the data. mean_dat : numpy.ndarray Means of the multivariate distribution to be used - for determining the best h + for determining the best h. method : str the method used for critical value estimation, @@ -333,30 +333,30 @@ def _objective_k_sample( Quantile to use for critical value estimation. rep_values : int - Number of the bootstrap replication + Number of the bootstrap replication. s_dat : numpy.ndarray Variances of the multivariate distribution to be used - for determining the best h + for determining the best h. skew_data : numpy.ndarray Skewness of the multivariate distribution to be used - for determining the best h + for determining the best h. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional n_jobs specifies the maximum number of concurrently running workers. If 1 is given, no joblib parallelism is used at all, which is useful for debugging. For more information on joblib n_jobs refer to - - https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html + https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html. Returns ------- List containing rep_values, delta, h and boolean - representing the rejection/acceptance of null hypothesis + representing the rejection/acceptance of null hypothesis. """ dk = delta_dim * delta if alternative == "location": @@ -434,22 +434,22 @@ def select_h( The function performs the selection of the optimal value for the tuning parameter h of the normal kernel function, for the two-sample and k-sample KBQD tests. It performs a small simulation study, generating samples according - to the family of alternative specified, for the chosen values + to the family of a specified alternative, for the chosen values of h_values and delta. 
Parameters ---------- x : numpy.ndarray or pandas.DataFrame - Data set of observations from X + Data set of observations from X. y : numpy.ndarray or pandas.DataFrame, optional Data set of observations from Y for two sample test - or set of labels in case of k-sample test + or set of labels in case of k-sample test. alternative : str, optional Family of alternative chosen for selecting h, must be one of "location", "scale" and "skewness". - Defaults to "location" + Defaults to "location". method : str, optional. The method used for critical value estimation, @@ -458,7 +458,8 @@ def select_h( b : float, optional. The size of the subsamples used in the subsampling algorithm. - Defaults to 0.8. + Defaults to 0.8 i.e. `0.8N` samples are used, where `N` + represents the total sample size. num_iter : int, optional. The number of iterations to use for critical value estimation. @@ -477,7 +478,7 @@ def select_h( Defaults to None. n_rep : int, optional. Defaults to 50. - Number of bootstrap replications + Number of bootstrap replications. n_jobs : int, optional. n_jobs specifies the maximum number of concurrently running workers. @@ -497,21 +498,21 @@ def select_h( values in h\\_values and delta. Defaults to False. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. Returns ------- h : float - The selected value of tuning parameter h + The selected value of tuning parameter h. h vs Power table : pandas.DataFrame - A table containing the h, delta and corresponding powers + A table containing the h, delta and corresponding powers. References ----------- Markatou M., Saraceno G., Chen Y. (2023). “Two- and k-Sample Tests Based on Quadratic Distances. ”Manuscript, (Department of Biostatistics, - University at Buffalo) + University at Buffalo). 
Examples -------- diff --git a/QuadratiK/kernel_test/_kernel_test.py b/QuadratiK/kernel_test/_kernel_test.py index 1d26510..d81d812 100644 --- a/QuadratiK/kernel_test/_kernel_test.py +++ b/QuadratiK/kernel_test/_kernel_test.py @@ -43,7 +43,8 @@ class KernelTest: The number of iterations to use for critical value estimation. Defaults to 150. b : float, optional - The size of the subsamples used in the subsampling algorithm. Defaults to 0.9. + The size of the subsamples used in the subsampling algorithm. Defaults to 0.9 i.e. + `0.9N` samples are used, where `N` represents the total sample size. quantile : float, optional The quantile to use for critical value estimation. Defaults to 0.95. @@ -63,7 +64,7 @@ class KernelTest: Maximum number of groups allowed. Defaults to 10. Change in case of more than 10 groups. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional. n_jobs specifies the maximum number of concurrently @@ -77,65 +78,65 @@ class KernelTest: ---------- For Normality Test: test_type\\_ : str - The type of test performed on the data + The type of test performed on the data. execution_time : float - Time taken for the test method to execute + Time taken for the test method to execute. un_h0_rejected\\_ : boolean - Whether the null hypothesis using Un is rejected (True) or not (False) + Whether the null hypothesis using Un is rejected (True) or not (False). vn_h0_rejected\\_ : boolean - Whether the null hypothesis using Vn is rejected (True) or not (False) + Whether the null hypothesis using Vn is rejected (True) or not (False). un_test_statistic\\_ : float - Un Test statistic of the perfomed test type + Un Test statistic of the performed test type. vn_test_statistic\\_ : float - Vn Test statistic of the perfomed test type + Vn Test statistic of the performed test type. un_cv\\_ : float - Critical value for Un + Critical value for Un. 
un_cv\\_ : float - Critical value for Vn + Critical value for Vn. For Two-Sample and K-Sample Test: test_type\\_ : str - The type of test performed on the data + The type of test performed on the data. execution_time : float - Time taken for the test method to execute + Time taken for the test method to execute. dn_h0_rejected\\_ : boolean - Whether the null hypothesis using Un is rejected (True) or not (False) + Whether the null hypothesis using Dn is rejected (True) or not (False). dn_test_statistic\\_ : float - Un Test statistic of the perfomed test type + Dn Test statistic of the performed test type. dn_cv\\_ : float - Critical value for Un + Critical value for Dn. trace_h0_rejected\\_ : boolean - Whether the null hypothesis using trace statistic is rejected (True) or not (False) + Whether the null hypothesis using trace statistic is rejected (True) or not (False). trace_test_statistic\\_ : float - Trace Test statistic of the perfomed test type + Trace Test statistic of the performed test type. trace_cv\\_ : float - Critical value for trace statistic + Critical value for trace statistic. cv_method\\_ : str - Critical value method used for performing the test + Critical value method used for performing the test. References ----------- Markatou M., Saraceno G., Chen Y (2023). “Two- and k-Sample Tests Based on Quadratic Distances. - ”Manuscript, (Department of Biostatistics, University at Buffalo) + ”Manuscript, (Department of Biostatistics, University at Buffalo). Lindsay BG, Markatou M. & Ray S. (2014) Kernels, Degrees of Freedom, and Power Properties of Quadratic Distance Goodness-of-Fit Tests, Journal of the American Statistical - Association, 109:505, 395-410, DOI: 10.1080/01621459.2013.836972 + Association, 109:505, 395-410, DOI: 10.1080/01621459.2013.836972. Examples -------- @@ -267,7 +268,7 @@ def test(self, x, y=None): Returns ------- self : object - Fitted estimator + Fitted estimator. 
""" self.x = x @@ -497,7 +498,7 @@ def stats(self): Returns ------- summary_stats_df : pandas.DataFrame - Dataframe of descriptive statistics + Dataframe of descriptive statistics. """ summary_stats_df = stats(self.x, self.y) return summary_stats_df.round(4) @@ -510,7 +511,7 @@ def summary(self, print_fmt="simple_grid"): ---------- print_fmt : str, optional. Used for printing the output in the desired format. Defaults to "simple_grid". - Supports all available options in tabulate, see here: https://pypi.org/project/tabulate/ + Supports all available options in tabulate, see here: https://pypi.org/project/tabulate/. Returns -------- diff --git a/QuadratiK/kernel_test/_utils.py b/QuadratiK/kernel_test/_utils.py index 6858b5c..e6ea1dc 100644 --- a/QuadratiK/kernel_test/_utils.py +++ b/QuadratiK/kernel_test/_utils.py @@ -90,7 +90,7 @@ def param_centering(kmat_zz, z_mat, cov_h, mu_hat, sigma_hat): Returns -------- centered kernel matrix : numpy.ndarray - Matrix of centered kernel + Matrix of centered kernel. """ n_z = z_mat.shape[0] @@ -113,7 +113,7 @@ def dof_normality_test(sigma_h, v): Parameters ----------- sigma_h : np.ndarray - ovariance matrix of the gaussian kernel + ovariance matrix of the gaussian kernel. v : np.ndarray Covariance matrix of the tested distribution G @@ -121,7 +121,7 @@ def dof_normality_test(sigma_h, v): Returns -------- DOF and Coefficient : tuple - DOF and the coefficient of the asymptotic distribution + DOF and the coefficient of the asymptotic distribution. """ num_dof = np.linalg.det(sigma_h) ** (-1 / 2) - np.linalg.det(sigma_h + 2 * v) ** ( -1 / 2 @@ -148,18 +148,18 @@ def variance_normality_test(sigma_h, v, n): Parameters ---------- sigma_h: np.ndarray - Covariance matrix of the gaussian kernel + Covariance matrix of the gaussian kernel. v : np.ndarray - Covariance matrix of the tested distribution G + Covariance matrix of the tested distribution G. n : int - Sample size + Sample size. 
Returns -------- variance: float - value of computed variance + value of computed variance. """ d = sigma_h.shape[0] var = ( @@ -186,7 +186,7 @@ def variance_two_sample_test(k_cen, n, m): Parameters ---------- k_cen : numpy.ndarray - Matrix with centered kernel values + Matrix with centered kernel values. n : int Number of samples of type/kind 1. @@ -197,7 +197,7 @@ def variance_two_sample_test(k_cen, n, m): Returns ------- Computed variances : tuple - Tuple of variances for Dn and trace statistics + Tuple of variances for Dn and trace statistics. """ np.fill_diagonal(k_cen, 0) @@ -240,17 +240,17 @@ def variance_k_sample_test(k_cen, sizes, cum_size): Parameters ----------- k_cen : np.ndarray - matrix with centered kernel values + matrix with centered kernel values. sizes : list, np.ndarray - vector indicating size of samples + vector indicating size of samples. cum_size : list, np.ndarray - vector indicating sample's cumulative sizes + vector indicating sample's cumulative sizes. Returns: variance: float - value of computed variance + value of computed variance. """ k = len(sizes) diff --git a/QuadratiK/poisson_kernel_test/_cv_functions.py b/QuadratiK/poisson_kernel_test/_cv_functions.py index e70bd10..8346027 100644 --- a/QuadratiK/poisson_kernel_test/_cv_functions.py +++ b/QuadratiK/poisson_kernel_test/_cv_functions.py @@ -32,14 +32,14 @@ def poisson_cv(d, size, rho, num_iter, quantile, random_state=None, n_jobs=8): The quantile value to be calculated from the results. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional n_jobs specifies the maximum number of concurrently running workers. If 1 is given, no joblib parallelism is used at all, which is useful for debugging. 
For more information on joblib n_jobs refer to - https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html + https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html. Returns --------- @@ -49,7 +49,7 @@ def poisson_cv(d, size, rho, num_iter, quantile, random_state=None, n_jobs=8): References ---------- Ding Yuxin, Markatou Marianthi, Saraceno Giovanni (2023). “Poisson Kernel-Based Tests for - Uniformity on the d-Dimensional Sphere.” Statistica Sinica. doi: doi:10.5705/ss.202022.0347 + Uniformity on the d-Dimensional Sphere.” Statistica Sinica. doi: 10.5705/ss.202022.0347. """ results = Parallel(n_jobs=n_jobs)( diff --git a/QuadratiK/poisson_kernel_test/_poisson_kernel_test.py b/QuadratiK/poisson_kernel_test/_poisson_kernel_test.py index 24c0a3b..0327d2e 100644 --- a/QuadratiK/poisson_kernel_test/_poisson_kernel_test.py +++ b/QuadratiK/poisson_kernel_test/_poisson_kernel_test.py @@ -20,8 +20,8 @@ class PoissonKernelTest: """ - Class for Poisson kernel-based quadratic distance test - of Uniformity on the Sphere + Class for Poisson kernel-based quadratic distance tests + of Uniformity on the Sphere. Parameters ---------- @@ -33,10 +33,10 @@ class PoissonKernelTest: Number of iterations for critical value estimation of U-statistic. quantile : float, optional - The quantile to use for critical value estimation + The quantile to use for critical value estimation. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. n_jobs : int, optional. n_jobs specifies the maximum number of concurrently running workers. @@ -48,20 +48,20 @@ class PoissonKernelTest: Attributes ----------- test_type\\_ : str - The type of test performed on the data + The type of test performed on the data. execution_time : float - Time taken for the test method to execute + Time taken for the test method to execute. 
u_statistic_h0\\_ : boolean A logical value indicating whether or not the null hypothesis - is rejected according to Un + is rejected according to Un. u_statistic_un\\_ : float The value of the U-statistic. u_statistic_cv\\_ : float - The empirical critical value for Un + The empirical critical value for Un. v_statistic_h0\\_ : boolean A logical value indicating whether or not the null hypothesis is @@ -76,7 +76,7 @@ class PoissonKernelTest: References ----------- Ding Y., Markatou M., Saraceno G. (2023). “Poisson Kernel-Based Tests for - Uniformity on the d-Dimensional Sphere.” Statistica Sinica. doi: doi:10.5705/ss.202022.0347 + Uniformity on the d-Dimensional Sphere.” Statistica Sinica. doi: 10.5705/ss.202022.0347. Examples --------- @@ -128,7 +128,7 @@ def __repr__(self): def test(self, x): """ Performs the Poisson kernel-based quadratic distance Goodness-of-fit tests for - Uniformity for spherical data using the Poisson kernel with concentration parameter :math:`rho` + Uniformity for spherical data using the Poisson kernel with concentration parameter :math:`\\rho`. Parameters ---------- @@ -138,7 +138,7 @@ def test(self, x): Returns ------- self : object - Fitted estimator + Fitted estimator. """ self.x = x if isinstance(x, np.ndarray): @@ -205,7 +205,7 @@ def stats(self): Returns ------- summary_stats_df : pandas.DataFrame - Dataframe of descriptive statistics + Dataframe of descriptive statistics. """ summary_stats_df = stats(self.x) return summary_stats_df.round(4) @@ -213,7 +213,7 @@ def stats(self): def summary(self, print_fmt="simple_grid"): """ Summary function generates a table for - the poisson kernel test results and the summary statistics. + the Poisson kernel test results and the summary statistics. 
Parameters ---------- diff --git a/QuadratiK/poisson_kernel_test/_utils.py b/QuadratiK/poisson_kernel_test/_utils.py index 234dc39..f91e976 100644 --- a/QuadratiK/poisson_kernel_test/_utils.py +++ b/QuadratiK/poisson_kernel_test/_utils.py @@ -32,7 +32,7 @@ def dof(d, rho): def stat_poisson_unif(x_mat, rho): """ Compute the Poisson kernel-based test for Uniformity - given a sample of observations on the Sphere + given a sample of observations on the Sphere. Parameters -------------- @@ -46,7 +46,7 @@ def stat_poisson_unif(x_mat, rho): --------- (Un,Vn) : tuple Tuple containing value of the - U-statistic and V-statistic + U-statistic and V-statistic. """ n_x = x_mat.shape[0] pmat = compute_poisson_matrix(x_mat, rho) diff --git a/QuadratiK/spherical_clustering/_pkbc.py b/QuadratiK/spherical_clustering/_pkbc.py index 8e4b4f2..ea37112 100644 --- a/QuadratiK/spherical_clustering/_pkbc.py +++ b/QuadratiK/spherical_clustering/_pkbc.py @@ -62,7 +62,7 @@ class PKBC: Defaults to 1e-7. random_state : int, None, optional. - Determines random number generation for centroid initialization. Defaults to None + Determines random number generation for centroid initialization. Defaults to None. n_jobs : int Used only for computing the WCSS efficiently. @@ -83,49 +83,49 @@ class PKBC: labels\\_ : dict Final cluster membership assigned by the algorithm to each observation. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a numpy.ndarray of shape (n_samples,) + and each value is a numpy.ndarray of shape (n_samples,). log_lik_vecs\\_ : dict Array of log-likelihood values for each initialization. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a numpy.ndarray of shape (num_init, ) + and each value is a numpy.ndarray of shape (num_init, ). loglik\\_ : dict Maximum value of the log-likelihood function. 
A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a float + and each value is a float. mu\\_ : dict Estimated centroids. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a numpy.ndarray of shape (n_clusters, n_features) + and each value is a numpy.ndarray of shape (n_clusters, n_features). num_iter_per_runs\\_ : dict Number of E-M iterations per run. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a numpy.ndarray of shape (num_init, ) + and each value is a numpy.ndarray of shape (num_init, ). post_probs\\_ : dict - Posterior probabilities of each observation for the indicated clusters + Posterior probabilities of each observation for the indicated clusters. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a numpy.ndarray of shape (n_samples, num_clust) + and each value is a numpy.ndarray of shape (n_samples, num_clust). rho\\_ : dict Estimated concentration parameters rho. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a numpy.ndarray of shape (n_clusters,) + and each value is a numpy.ndarray of shape (n_clusters,). euclidean\\_wcss\\_ : dict Values of within-cluster sum of squares computed with Euclidean distance. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a float + and each value is a float. cosine\\_wcss\\_ : dict Values of within-cluster sum of squares computed with cosine similarity. A dictionary containing key-value pairs, where each key is an element from the `num_clust` vector, - and each value is a float + and each value is a float. 
References ---------- @@ -495,12 +495,12 @@ def stats_clusters(self, num_clust): Parameters ----------- num_clust : int - Number of clusters for which the summary statistics should be shown + Number of clusters for which the summary statistics should be shown. Returns ------- summary_stats_df : pandas.DataFrame - Dataframe of descriptive statistics + Dataframe of descriptive statistics. """ summary_stats = stats(self.dat_copy, self.labels_[num_clust]) @@ -513,10 +513,10 @@ def predict(self, X, num_clust): Parameters ----------- X : numpy.ndarray, pandas.DataFrame - New data to predict membership + New data to predict membership. num_clust : int - Number of clusters to be used for prediction + Number of clusters to be used for prediction. Returns -------- @@ -586,7 +586,7 @@ def plot(self, num_clust, y_true=None): y_true : numpy.ndarray, list, pandas.series, optional - If `y_true` is None, then only clusters colored according to the predicted labels. - - If `y_true` is provided, clusters are colored according to the predicted and true labels in different subplots + - If `y_true` is provided, clusters are colored according to the predicted and true labels in different subplots. Returns ------- diff --git a/QuadratiK/spherical_clustering/_pkbd.py b/QuadratiK/spherical_clustering/_pkbd.py index 558e102..f511a25 100644 --- a/QuadratiK/spherical_clustering/_pkbd.py +++ b/QuadratiK/spherical_clustering/_pkbd.py @@ -105,7 +105,7 @@ def rpkb(self, n, mu, rho, method="rejvmf", random_state=None): Defaults to 'rejvmf'. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. Returns ------- @@ -122,7 +122,7 @@ def rpkb(self, n, mu, rho, method="rejvmf", random_state=None): 29:4, 758-770, DOI: 10.1080/10618600.2020.1740713. Sablica L., Hornik K., Leydold J. 
"Efficient sampling from the PKBD - distribution," Electronic Journal of Statistics, 17(2), 2180-2209, (2023) + distribution," Electronic Journal of Statistics, 17(2), 2180-2209, (2023). Examples -------- diff --git a/QuadratiK/spherical_clustering/_utils.py b/QuadratiK/spherical_clustering/_utils.py index 038d462..7cf50ad 100644 --- a/QuadratiK/spherical_clustering/_utils.py +++ b/QuadratiK/spherical_clustering/_utils.py @@ -30,7 +30,7 @@ def c_d_lambda(beta, p, lamda): def calculate_wcss_euclidean(k, memb_best, dat, mu_best): """ - Used for computing the euclidean WCSS for a single cluster + Used for computing the euclidean WCSS for a single cluster. """ idx = np.where(memb_best == k)[0] return np.sum(np.linalg.norm(dat[idx] - mu_best[k], axis=1) ** 2) @@ -38,7 +38,7 @@ def calculate_wcss_euclidean(k, memb_best, dat, mu_best): def calculate_wcss_cosine(k, memb_best, dat, mu_best): """ - Used for computing the cosine WCSS for a single cluster + Used for computing the cosine WCSS for a single cluster. """ idx = np.where(memb_best == k)[0] return np.sum(dat[idx] * mu_best[k]) diff --git a/QuadratiK/tools/graphics.py b/QuadratiK/tools/graphics.py index 0d79941..d8d1362 100644 --- a/QuadratiK/tools/graphics.py +++ b/QuadratiK/tools/graphics.py @@ -62,7 +62,7 @@ def sphere3d(x, y=None): The parameter `x` represents the input data for the scatter plot. It should be a 2D array-like object with shape (n_samples, 3), where each row represents the coordinates of a point in - 3D space + 3D space. y : numpy.ndarray, list, pandas.series, optional The parameter `y` is an optional input that determines the color and @@ -166,7 +166,7 @@ def plot_clusters_2d(x, y=None): The parameter `x` is a 2-dimensional array or matrix containing the coordinates of the data points to be plotted. Each row of `x` represents the coordinates of a single data point - in the 2-dimensional space + in the 2-dimensional space. 
y : numpy.ndarray, pandas.DataFrame, optional The parameter `y` is an optional array that represents the labels diff --git a/QuadratiK/tools/tools.py b/QuadratiK/tools/tools.py index d1b6c78..34ea672 100644 --- a/QuadratiK/tools/tools.py +++ b/QuadratiK/tools/tools.py @@ -94,7 +94,7 @@ def stats(x, y=None): def sample_hypersphere(npoints=100, ndim=3, random_state=None): """ - Generate random samples from the hypersphere + Generate random samples from the hypersphere. Parameters -------------- @@ -107,7 +107,7 @@ def sample_hypersphere(npoints=100, ndim=3, random_state=None): Default is 3. random_state : int, None, optional. - Seed for random number generation. Defaults to None + Seed for random number generation. Defaults to None. Returns --------- diff --git a/README.rst b/README.rst index 494dd6d..911b075 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,6 @@ +========== QuadratiK -========= +========== |GitHub Actions|_ |Codecov|_ |Documentation Status|_ |PyPI Version|_ |PyPI Python Version|_ |PyPI Downloads|_ |GitHub Search|_ |Black|_ .. |GitHub Actions| image:: https://github.com/rmj3197/QuadratiK/actions/workflows/release.yml/badge.svg @@ -41,34 +42,36 @@ QuadratiK .. _Black: https://github.com/psf/black Introduction -------------- +============= The QuadratiK package is implemented in both **R** and **Python**, providing a comprehensive set of goodness-of-fit tests and a clustering technique using kernel-based quadratic distances. This framework aims to bridge the gap between the statistical and machine learning literatures. It includes: * **Goodness-of-Fit Tests** : The software implements one, two, and k-sample tests for goodness of fit, offering an efficient and mathematically sound way to assess the fit of probability distributions. Expanded capabilities include supporting tests for uniformity on the :math:`d`-dimensional Sphere based on Poisson kernel densities. 
-* **Clustering Algorithm for Spherical Data**: the package incorporates a unique clustering algorithm specifically tailored for spherical data. This algorithm leverages a mixture of Poisson-kernel-based densities on the sphere, enabling effective clustering of spherical data or data that has been spherically transformed. This facilitates the uncovering of underlying patterns and relationships in the data. +* **Clustering Algorithm for Spherical Data**: the package incorporates a unique clustering algorithm specifically tailored for spherical data. This algorithm leverages a mixture of Poisson-kernel-based densities on the sphere, enabling effective clustering of spherical data or data that has been spherically transformed. This facilitates the uncovering of underlying patterns and relationships in the data. Additionally, the package includes random number generation from Poisson kernel-based densities. * **Additional Features**: Alongside these functionalities, the software includes additional graphical functions, aiding users in validating cluster results as well as visualizing and representing clustering results. This enhances the interpretability and usability of the analysis. +* **User Interface**: We also provide a dashboard application built using ``streamlit`` allowing users to access the methods implemented in the package without the need for programming. + Authors -++++++++ +--------- Giovanni Saraceno , Marianthi Markatou , Raktim Mukhopadhyay , Mojgan Golzy Mantainer: Raktim Mukhopadhyay Documentation -+++++++++++++++ +--------------- The documentation is hosted on Read the Docs at - https://quadratik.readthedocs.io/en/latest/ Installation using ``pip`` -+++++++++++++++++++++++++++ +---------------------------- ``pip install QuadratiK`` Examples -+++++++++ +---------- Find basic examples at `QuadratiK Examples `_ @@ -78,10 +81,10 @@ You can also execute the examples on Binder |Binder|. 
:target: https://mybinder.org/v2/gh/rmj3197/QuadratiK/HEAD?labpath=doc%2Fsource%2Fuser_guide Community -+++++++++++ +------------ Development Version Installation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ For installing the development version, please download the code files from the master branch of the Github repository. Please note that installation from Github might be buggy, for latest stable release please download using ``pip``. @@ -94,26 +97,26 @@ For downloading from Github use the following instructions, pip install -e . Contributing Guide -~~~~~~~~~~~~~~~~~~~~ +""""""""""""""""""""" Please refer to the `Contributing Guide `_. Code of Conduct -~~~~~~~~~~~~~~~~ +""""""""""""""""" The code of conduct can be found at `Code of Conduct `_. License -~~~~~~~~ +""""""""" This project uses the GPL-3.0 license, with a full version of the license included in the repository `here `_. Funding Information -++++++++++++++++++++ +--------------------- The work has been supported by Kaleida Health Foundation, Food and Drug Administration, and Department of Biostatistics, University at Buffalo. References -+++++++++++ +------------- Saraceno G., Markatou M., Mukhopadhyay R., Golzy M. (2024). Goodness-of-Fit and Clustering of Spherical Data: the QuadratiK package in R and Python. arXiv preprint arXiv:2402.02290. diff --git a/doc/source/api_reference/index.rst b/doc/source/api_reference/index.rst index efde65c..0f1c391 100644 --- a/doc/source/api_reference/index.rst +++ b/doc/source/api_reference/index.rst @@ -1,7 +1,8 @@ .. _api_reference: +=============== API Reference -============= +=============== .. automodule:: QuadratiK.kernel_test diff --git a/doc/source/changelog/v1.1.0.rst b/doc/source/changelog/v1.1.0.rst index d42d245..98c5df4 100644 --- a/doc/source/changelog/v1.1.0.rst +++ b/doc/source/changelog/v1.1.0.rst @@ -2,11 +2,11 @@ QuadratiK Version 1.1.0 ======================== This version includes - - 1. **[NEW]** Minor bug fixes - 2. 
**[NEW]** Enforced Opt Out from Telemetry (streamlit) while using UI - 3. **[NEW]** Additional test cases to increase coverage - 4. **[NEW]** Documentation improvements (hosting on Read the Docs, and included Code of Conduct and Contributing Guide) + 1. **[NEW]** Minor bug fixes. + 2. **[NEW]** Enforced Opt Out from Telemetry (streamlit) while using UI. + 3. **[NEW]** Additional test cases to increase coverage. + 4. **[NEW]** Documentation improvements (hosting on Read the Docs, and included Code of Conduct and Contributing Guide). 5. **[NEW]** Added predict method to extract cluster membership of new data. - 6. **[NEW]** Included Vn statistic for Normality Test - 7. **[NEW]** Included trace statistics and computation of variances for Two-sample and K-sample test - 8. **[REMOVED]** Removed non-parameteric Normality test \ No newline at end of file + 6. **[NEW]** Included Vn statistic for Normality Test. + 7. **[NEW]** Included trace statistics and computation of variances for Two-sample and K-sample test. + 8. **[REMOVED]** Removed non-parametric Normality test. \ No newline at end of file diff --git a/doc/source/changelog/v1.1.1.rst b/doc/source/changelog/v1.1.1.rst index 237044c..d5fbf03 100644 --- a/doc/source/changelog/v1.1.1.rst +++ b/doc/source/changelog/v1.1.1.rst @@ -6,8 +6,8 @@ This version is currently in development. This version can be found on the master branch. This version includes - - 1. **[NEW]** Included repr method for KernelTest and PoissonKernelTest classes - 2. **[NEW]** Separate attributes for Dn and Trace statistics, CV and H0 - 3. **[NEW]** Updated summary print format for KernelTest and PoissonKernelTest - 4. **[NEW]** Updated Examples - 5. **[NEW]** Included Binder link \ No newline at end of file + 1. **[NEW]** Included repr method for KernelTest and PoissonKernelTest classes. + 2. **[NEW]** Separate attributes for Dn and Trace statistics, CV and H0. + 3. **[NEW]** Updated summary print format for KernelTest and PoissonKernelTest. + 4. 
**[NEW]** Updated Examples. + 5. **[NEW]** Included Binder link. \ No newline at end of file diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index f97072a..b3f30d7 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -1,15 +1,16 @@ +============ Development ============ Code of Conduct ------------------ +================= .. toctree:: :maxdepth: 4 CODE_OF_CONDUCT.rst Contributing Guide --------------------- +==================== .. toctree:: :maxdepth: 4 diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index b761380..02021e8 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -1,3 +1,4 @@ +================ Getting Started ================ diff --git a/doc/source/user_guide/basic_usage.ipynb b/doc/source/user_guide/basic_usage.ipynb index 9ec9962..721786e 100644 --- a/doc/source/user_guide/basic_usage.ipynb +++ b/doc/source/user_guide/basic_usage.ipynb @@ -7,7 +7,7 @@ "# Usage Examples for `QuadratiK` in Python\n", "Authors : Giovanni Saraceno, Marianthi Markatou, Raktim Mukhopadhyay, Mojgan Golzy\n", "\n", - "Date : 28 May 2024" + "Date : 4 June 2024" ] }, { @@ -24,7 +24,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Introduction" + "## Introduction" ] }, { @@ -38,7 +38,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Installation" + "### Installation" ] }, { @@ -52,7 +52,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Normality Test" + "## Normality Test" ] }, { @@ -91,7 +91,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# K-Sample Test" + "## K-Sample Test" ] }, { @@ -139,7 +139,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Two-Sample Test\n" + "## Two-Sample Test\n" ] }, { @@ -206,7 +206,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Uniformity Test" + "## Uniformity Test" ] }, { @@ -243,25 +243,14 @@ "cell_type": 
"markdown", "metadata": {}, "source": [ - "The `qq_plot` function can be used to generate the qq-plots between the given samples and the uniform distribution " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from QuadratiK.tools import qq_plot\n", - "\n", - "qq_plot(data_unif, dist=\"uniform\")" + "The `qq_plot` function can be used to generate the qq-plots between the given samples and the uniform distribution." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Tuning Parameter $h$ selection" + "## Tuning Parameter $h$ selection" ] }, { @@ -271,7 +260,7 @@ "The algorithm is implemented through the function `select_h`. \n", "The function select_h takes as arguments the data matrix x, the vector of \n", "labels y, and the type of alternatives (one of \"location\", \"scale\" or \n", - "\"skewness\"). select_h returns not only the selected value of h, but also the \n", + "\"skewness\"); select_h returns not only the selected value of h, but also the \n", "power plot versus the considered list of h values for each tested value of \n", "$\\delta$. 
" ] @@ -280,7 +269,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## For Two-Sample Test" + "### For Two-Sample Test" ] }, { @@ -322,7 +311,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## For K-Sample Test" + "### For K-Sample Test" ] }, { @@ -351,14 +340,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Real World Examples" + "## Real World Examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Two-Sample Test" + "### Two-Sample Test" ] }, { @@ -397,15 +386,15 @@ "# performing the two sample test\n", "two_sample_test = KernelTest(h=h_selected, num_iter=150, random_state=42).test(X1, X2)\n", "\n", - "# printing the summary for the two sample test\n", - "print(two_sample_test.summary())" + "# printing two sample test object\n", + "print(two_sample_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## K-Sample Test" + "### K-Sample Test" ] }, { @@ -444,7 +433,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Poisson Kernel Based Clustering" + "### Poisson Kernel Based Clustering" ] }, { @@ -539,7 +528,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Initializing the Dashboard" + "## Initializing the Dashboard" ] }, { @@ -561,6 +550,20 @@ "UI().run()\n", "\"\"\"" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Dashboard](dash-landing.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above image shows the landing page of the user interface in the `QuadratiK` package" + ] } ], "metadata": { diff --git a/doc/source/user_guide/dash-landing.png b/doc/source/user_guide/dash-landing.png new file mode 100644 index 0000000..1305e62 Binary files /dev/null and b/doc/source/user_guide/dash-landing.png differ diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 6619149..482fd3e 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -1,22 +1,23 @@ 
+============= User Guide ============= Dataset ---------------- +======== .. toctree:: :maxdepth: 4 datasets.rst Common Usage Examples ------------------------ +====================== .. toctree:: :maxdepth: 4 basic_usage Random Sampling from PKBD ---------------------------- +=========================== .. toctree:: :maxdepth: 4