Data Generation#
make_blobs#
#include <raft/random/make_blobs.cuh>
namespace raft::random
-
template<typename DataT, typename IdxT, typename layout>
void make_blobs(raft::resources const &handle, raft::device_matrix_view<DataT, IdxT, layout> out, raft::device_vector_view<IdxT, IdxT> labels, IdxT n_clusters = 5, std::optional<raft::device_matrix_view<DataT, IdxT, layout>> centers = std::nullopt, std::optional<raft::device_vector_view<DataT, IdxT>> const cluster_std = std::nullopt, const DataT cluster_std_scalar = (DataT)1.0, bool shuffle = true, DataT center_box_min = (DataT)-10.0, DataT center_box_max = (DataT)10.0, uint64_t seed = 0ULL, GeneratorType type = GenPC)# GPU-equivalent of sklearn.datasets.make_blobs.
- Template Parameters:
DataT – output data type
IdxT – indexing arithmetic type
layout – memory layout of the out and centers matrix views
- Parameters:
handle – [in] raft handle for managing expensive resources
out – [out] generated data [on device] [dim = n_rows x n_cols]
labels – [out] labels for the generated data [on device] [len = n_rows]
n_clusters – [in] number of clusters (or classes) to generate
centers – [in] centers of each cluster; pass std::nullopt if you need these to be generated randomly as well [on device] [dim = n_clusters x n_cols]
cluster_std – [in] standard deviation of each cluster center; pass std::nullopt if this is to be read from cluster_std_scalar instead. [on device] [len = n_clusters]
cluster_std_scalar – [in] if ‘cluster_std’ is std::nullopt, then use this as the std-dev across all dimensions.
shuffle – [in] shuffle the generated dataset and labels
center_box_min – [in] min value of box from which to pick cluster centers. Used only if ‘centers’ is std::nullopt
center_box_max – [in] max value of box from which to pick cluster centers. Used only if ‘centers’ is std::nullopt
seed – [in] seed for the RNG
type – [in] RNG type
make_regression#
#include <raft/random/make_regression.cuh>
namespace raft::random
-
template<typename DataT, typename IdxT>
void make_regression(raft::resources const &handle, raft::device_matrix_view<DataT, IdxT, raft::row_major> out, raft::device_matrix_view<DataT, IdxT, raft::row_major> values, IdxT n_informative, std::optional<raft::device_matrix_view<DataT, IdxT, raft::row_major>> coef, DataT bias = DataT{}, IdxT effective_rank = static_cast<IdxT>(-1), DataT tail_strength = DataT{0.5}, DataT noise = DataT{}, bool shuffle = true, uint64_t seed = 0ULL, GeneratorType type = GenPC)# GPU-equivalent of sklearn.datasets.make_regression as documented at: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html.
- Template Parameters:
DataT – Scalar type
IdxT – Index type
- Parameters:
handle – [in] RAFT handle
out – [out] Row-major (samples, features) matrix to store the problem data
values – [out] Row-major (samples, targets) matrix to store the values for the regression problem
n_informative – [in] Number of informative features (non-zero coefficients)
coef – [out] If present, a row-major (features, targets) matrix to store the coefficients used to generate the values for the regression problem
bias – [in] A scalar that will be added to the values
effective_rank – [in] The approximate rank of the data matrix (used to create correlations in the data). -1 is the code to use well-conditioned data
tail_strength – [in] The relative importance of the fat noisy tail of the singular values profile if effective_rank is not -1
noise – [in] Standard deviation of the Gaussian noise applied to the output
shuffle – [in] Shuffle the samples and the features
seed – [in] Seed for the random number generator
type – [in] Random generator type
rmat#
#include <raft/random/rmat_rectangular_generator.cuh>
namespace raft::random
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(raft::resources const &handle, raft::random::RngState &r, raft::device_vector_view<const ProbT, IdxT> theta, raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out, raft::device_vector_view<IdxT, IdxT> out_src, raft::device_vector_view<IdxT, IdxT> out_dst, IdxT r_scale, IdxT c_scale)# Generate a bipartite RMAT graph for a rectangular adjacency matrix.
This is the most general of several overloads of rmat_rectangular_gen in this file, and thus has the most detailed documentation.
We call the r_scale != c_scale case the “rectangular adjacency matrix” case (in other words, generating bipartite graphs). In this case, at depth >= r_scale, the distribution is assumed to be: [theta[4 * depth] + theta[4 * depth + 2], theta[4 * depth + 1] + theta[4 * depth + 3]; 0, 0].
Then for depth >= c_scale, the distribution is assumed to be: [theta[4 * depth] + theta[4 * depth + 1], 0; theta[4 * depth + 2] + theta[4 * depth + 3], 0].
Note
This can generate duplicate edges and self-loops. It is the responsibility of the caller to clean them up accordingly.
Note
This also only generates directed graphs. If undirected graphs are needed, then a separate post-processing step is expected to be done by the caller.
- Template Parameters:
IdxT – Type of each node index
ProbT – Data type used for probability distributions (either fp32 or fp64)
- Parameters:
handle – [in] RAFT handle, containing the CUDA stream on which to schedule work
r – [in] underlying state of the random generator. Especially useful when one wants to call this API multiple times in order to generate a larger graph. In that case, create this object once with the initial seed and keep passing the same object to each successive call.
out – [out] Generated edgelist [on device], packed in array-of-structs fashion. In each row, the first element is the source node id, and the second element is the destination node id.
out_src – [out] Source node id’s [on device].
out_dst – [out] Destination node id’s [on device]. out_src and out_dst together form the struct-of-arrays representation of the same output data as out.
theta – [in] distribution of each quadrant at each level of resolution. Since these are probabilities, each of the 2x2 matrices for each level of the RMAT must sum to one. [on device] [dim = max(r_scale, c_scale) x 2 x 2]. It is assumed that each group of 2 x 2 numbers sums to 1.
r_scale – [in] 2^r_scale represents the number of source nodes
c_scale – [in] 2^c_scale represents the number of destination nodes
- Pre:
out.extent(0) == 2 * out_src.extent(0) is true
out_src.extent(0) == out_dst.extent(0) is true
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(raft::resources const &handle, raft::random::RngState &r, raft::device_vector_view<const ProbT, IdxT> theta, raft::device_vector_view<IdxT, IdxT> out_src, raft::device_vector_view<IdxT, IdxT> out_dst, IdxT r_scale, IdxT c_scale)# Overload of
rmat_rectangular_gen that only generates the struct-of-arrays (two vectors) output representation.
This overload only generates the struct-of-arrays (two vectors) output representation: output vector out_src of source node id’s, and output vector out_dst of destination node id’s.
- Pre:
out_src.extent(0) == out_dst.extent(0) is true
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(raft::resources const &handle, raft::random::RngState &r, raft::device_vector_view<const ProbT, IdxT> theta, raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out, IdxT r_scale, IdxT c_scale)# Overload of
rmat_rectangular_gen that only generates the array-of-structs (one vector) output representation.
This overload only generates the array-of-structs (one vector) output representation: a single output vector out, where in each row, the first element is the source node id, and the second element is the destination node id.
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(raft::resources const &handle, raft::random::RngState &r, raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out, raft::device_vector_view<IdxT, IdxT> out_src, raft::device_vector_view<IdxT, IdxT> out_dst, ProbT a, ProbT b, ProbT c, IdxT r_scale, IdxT c_scale)# Overload of
rmat_rectangular_gen that assumes the same a, b, c, d probability distributions across all the scales, and takes all three output vectors (out with the array-of-structs output representation, and out_src and out_dst with the struct-of-arrays output representation).
a, b, and c effectively replace the above overloads’ theta parameter.
- Pre:
out.extent(0) == 2 * out_src.extent(0) is true
out_src.extent(0) == out_dst.extent(0) is true
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(raft::resources const &handle, raft::random::RngState &r, raft::device_vector_view<IdxT, IdxT> out_src, raft::device_vector_view<IdxT, IdxT> out_dst, ProbT a, ProbT b, ProbT c, IdxT r_scale, IdxT c_scale)# Overload of
rmat_rectangular_gen that assumes the same a, b, c, d probability distributions across all the scales, and takes only two output vectors (the struct-of-arrays output representation).
a, b, and c effectively replace the above overloads’ theta parameter.
- Pre:
out_src.extent(0) == out_dst.extent(0) is true
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(raft::resources const &handle, raft::random::RngState &r, raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out, ProbT a, ProbT b, ProbT c, IdxT r_scale, IdxT c_scale)# Overload of
rmat_rectangular_gen that assumes the same a, b, c, d probability distributions across all the scales, and takes only one output vector (the array-of-structs output representation).
a, b, and c effectively replace the above overloads’ theta parameter.
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(IdxT *out, IdxT *out_src, IdxT *out_dst, const ProbT *theta, IdxT r_scale, IdxT c_scale, IdxT n_edges, cudaStream_t stream, raft::random::RngState &r)# Legacy overload of
rmat_rectangular_gen taking raw arrays instead of mdspan.
- Template Parameters:
IdxT – type of each node index
ProbT – data type used for probability distributions (either fp32 or fp64)
- Parameters:
out – [out] generated edgelist [on device] [dim = n_edges x 2]. In each row the first element is the source node id, and the second element is the destination node id. If you don’t need this output then pass a nullptr in its place.
out_src – [out] list of source node id’s [on device] [len = n_edges]. If you don’t need this output then pass a nullptr in its place.
out_dst – [out] list of destination node id’s [on device] [len = n_edges]. If you don’t need this output then pass a nullptr in its place.
theta – [in] distribution of each quadrant at each level of resolution. Since these are probabilities, each of the 2x2 matrices for each level of the RMAT must sum to one. [on device] [dim = max(r_scale, c_scale) x 2 x 2]. It is assumed that each group of 2 x 2 numbers sums to 1.
r_scale – [in] 2^r_scale represents the number of source nodes
c_scale – [in] 2^c_scale represents the number of destination nodes
n_edges – [in] number of edges to generate
stream – [in] cuda stream on which to schedule the work
r – [in] underlying state of the random generator. Especially useful when one wants to call this API multiple times in order to generate a larger graph. In that case, create this object once with the initial seed and keep passing the same object to each successive call.
-
template<typename IdxT, typename ProbT>
void rmat_rectangular_gen(IdxT *out, IdxT *out_src, IdxT *out_dst, ProbT a, ProbT b, ProbT c, IdxT r_scale, IdxT c_scale, IdxT n_edges, cudaStream_t stream, raft::random::RngState &r)# Legacy overload of
rmat_rectangular_gen
taking raw arrays instead of mdspan. This overload assumes the same a, b, c, d probability distributions across all the scales.