Microsoft.ML.KMeansClustering.xml
22.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
<?xml version="1.0"?>
<doc>
<assembly>
<name>Microsoft.ML.KMeansClustering</name>
</assembly>
<members>
<member name="T:Microsoft.ML.KMeansClusteringExtensions">
<summary>
The trainer context extensions for the <see cref="T:Microsoft.ML.Trainers.KMeansPlusPlusTrainer"/>.
</summary>
</member>
<member name="M:Microsoft.ML.KMeansClusteringExtensions.KMeans(Microsoft.ML.ClusteringCatalog.ClusteringTrainers,System.String,System.String,System.Int32)">
<summary>
Train a KMeans++ clustering algorithm.
</summary>
<param name="catalog">The clustering catalog trainer object.</param>
<param name="featureColumnName">The name of the feature column.</param>
<param name="exampleWeightColumnName">The name of the example weight column (optional).</param>
<param name="clustersCount">The number of clusters to use for KMeans.</param>
<example>
<format type="text/markdown">
<![CDATA[
]]></format>
</example>
</member>
<member name="M:Microsoft.ML.KMeansClusteringExtensions.KMeans(Microsoft.ML.ClusteringCatalog.ClusteringTrainers,Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options)">
<summary>
Train a KMeans++ clustering algorithm.
</summary>
<param name="catalog">The clustering catalog trainer object.</param>
<param name="options">Algorithm advanced options.</param>
</member>
<member name="T:Microsoft.ML.Trainers.KMeansModelParameters">
<example>
<format type="text/markdown">
<![CDATA[
]]></format>
</example>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansModelParameters.GetVersionInfo">
<summary>
Version information to be saved in binary format
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansModelParameters.#ctor(Microsoft.ML.IHostEnvironment,System.Int32,Microsoft.ML.Data.VBuffer{System.Single}[],System.Boolean)">
<summary>
Initialize predictor with a trained model.
</summary>
<param name="env">The host environment</param>
<param name="k">Number of centroids</param>
<param name="centroids">Coordinates of the centroids</param>
<param name="copyIn">If true then the <paramref name="centroids"/> vectors will be subject to
a deep copy, if false then this constructor will take ownership of the passed in centroid vectors.
If false then the caller must take care to not use or modify the input vectors once this object
is constructed, and should probably remove all references.</param>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansModelParameters.#ctor(Microsoft.ML.IHostEnvironment,Microsoft.ML.ModelLoadContext)">
<summary>
Initialize predictor from a binary file.
</summary>
<param name="ctx">The load context</param>
<param name="env">The host environment</param>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansModelParameters.SaveCore(Microsoft.ML.ModelSaveContext)">
<summary>
Save the predictor in binary format.
</summary>
<param name="ctx">The context to save to</param>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansModelParameters.Create(Microsoft.ML.IHostEnvironment,Microsoft.ML.ModelLoadContext)">
<summary>
This method is called by reflection to instantiate a predictor.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansModelParameters.InitPredictor">
<summary>
Initialize internal parameters: L2 norms of the _centroids.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansModelParameters.GetClusterCentroids(Microsoft.ML.Data.VBuffer{System.Single}[]@,System.Int32@)">
<summary>
Copies the centroids to a set of provided buffers.
</summary>
<param name="centroids">The buffer to which to copy. Will be extended to
an appropriate length, if necessary.</param>
<param name="k">The number of clusters, corresponding to the logical size of
<paramref name="centroids"/>.</param>
</member>
<member name="T:Microsoft.ML.Trainers.KMeansPlusPlusTrainer">
<summary>
K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified
number of clusters in order to minimize the within-cluster sum of squares.
</summary><remarks>
K-means++ improves upon K-means by using a better method for choosing the initial cluster centers; this implementation additionally uses the <a href="https://research.microsoft.com/apps/pubs/default.aspx?id=252149">Yinyang K-Means</a> acceleration method.
YYK-Means accelerates K-Means up to an order of magnitude while producing exactly the same clustering results (modulo floating point precision issues).
YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration.
It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations.
<para>For more information on K-means, and K-means++ see:</para>
<list type="bullet">
<item><description><a href="https://en.wikipedia.org/wiki/K-means_clustering">K-means</a></description></item>
<item><description><a href="https://en.wikipedia.org/wiki/K-means%2b%2b">K-means++</a></description></item>
</list>
</remarks>
</member>
<member name="F:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Defaults.ClustersCount">
<value>The number of clusters.</value>
</member>
<member name="F:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options.ClustersCount">
<summary>
The number of clusters.
</summary>
</member>
<member name="F:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options.InitAlgorithm">
<summary>
Cluster initialization algorithm.
</summary>
</member>
<member name="F:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options.OptimizationTolerance">
<summary>
Tolerance parameter for trainer convergence. Low = slower, more accurate.
</summary>
</member>
<member name="F:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options.MaxIterations">
<summary>
Maximum number of iterations.
</summary>
</member>
<member name="F:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options.AccelerationMemoryBudgetMb">
<summary>
Memory budget (in MBs) to use for KMeans acceleration.
</summary>
</member>
<member name="F:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options.NumThreads">
<summary>
Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansPlusPlusTrainer.#ctor(Microsoft.ML.IHostEnvironment,Microsoft.ML.Trainers.KMeansPlusPlusTrainer.Options)">
<summary>
Initializes a new instance of <see cref="T:Microsoft.ML.Trainers.KMeansPlusPlusTrainer"/>
</summary>
<param name="env">The <see cref="T:Microsoft.ML.IHostEnvironment"/> to use.</param>
<param name="options">The advanced options of the algorithm.</param>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansPlusPlusInit.Initialize(Microsoft.ML.IHost,System.Int32,Microsoft.ML.IChannel,Microsoft.ML.Trainers.FeatureFloatVectorCursor.Factory,System.Int32,System.Int32,Microsoft.ML.Data.VBuffer{System.Single}[],System.Int64@,System.Int64@,System.Boolean)">
<summary>
Initialize starting centroids via KMeans++ algorithm. This algorithm will always run single-threaded,
regardless of the value of <paramref name="numThreads" />.
</summary>
</member>
<member name="T:Microsoft.ML.Trainers.KMeansAcceleratedRowMap">
<summary>
An instance of this class is used by SharedStates in YinYangTrainer
and KMeansBarBarInitialization. It effectively bounds MaxInstancesToAccelerate and
initializes RowIndexGetter.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansAcceleratedRowMap.BuildParallelIndexLookup(Microsoft.ML.Trainers.FeatureFloatVectorCursor.Factory)">
<summary>
Initializes the parallel index lookup HashArray using a sequential RowCursor. We
preinitialize the HashArray so we can perform lock-free lookup operations during
the primary KMeans pass.
</summary>
</member>
<member name="T:Microsoft.ML.Trainers.KMeansBarBarInitialization.SharedState">
<summary>
Data for optimizing KMeans|| initialization. Very similar to the SharedState class.
For every instance, there is a space for the best weight and best cluster computed.
In this class, new clusters mean the clusters that were added to the cluster set
in the previous round of KMeans|| and old clusters are the rest of them (the ones
that were added in the rounds before the previous one).
In every round of KMeans||, numSamplesPerRound new clusters are added to the set of clusters.
There are 'numRounds' number of rounds. We compute and store the distance of each new
cluster from every round to all of the previous clusters and use it
to avoid unnecessary computation by applying the triangle inequality.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansBarBarInitialization.SharedState.SetInstanceCluster(System.Int32,System.Single,System.Int32)">
<summary>
When assigning an accelerated row to a cluster, we store away the weight
to its closest cluster, as well as the identity of the new
closest cluster. Note that bestWeight can be negative since it
corresponds to the weight of a distance which does not include
the L2 norm of the point itself.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansBarBarInitialization.SharedState.SetClusterDistance(System.Int32,Microsoft.ML.Data.VBuffer{System.Single}@,System.Single,System.Int32,Microsoft.ML.Data.VBuffer{System.Single}@,System.Single)">
<summary>
Computes and stores the distance of a new cluster to an old cluster
<paramref name="newClusterFeatures"/> must be between 0..numSamplesPerRound-1.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansBarBarInitialization.SharedState.CanWeightComputationBeAvoided(System.Single,System.Int32,System.Int32)">
<summary>
This function is the key to use triangle inequality. Given an instance x distance to the best
old cluster, cOld, and distance of a new cluster, cNew, to cOld, this function evaluates whether
the distance computation of dist(x,cNew) can be avoided.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansBarBarInitialization.FindBestCluster(Microsoft.ML.Data.VBuffer{System.Single}@,System.Int32,Microsoft.ML.Trainers.KMeansBarBarInitialization.SharedState,System.Int32,System.Int32,Microsoft.ML.Data.VBuffer{System.Single}[],System.Single[],System.Boolean,System.Boolean,System.Single@,System.Int32@)">
<summary>
This function finds the best cluster and the best weight for an instance using
smart triangle inequality to avoid unnecessary weight computations.
Note that <paramref name="needToStoreWeight"/> is used to avoid storing the new cluster in the
final round. After the final round, best cluster information will be ignored.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansBarBarInitialization.ComputeAccelerationMemoryRequirement(System.Int64,System.Int32,System.Int32,System.Boolean,System.Int64@,System.Int64@)">
<summary>
This method computes the memory requirement for _clusterDistances in SharedState (clusterBytes) and
the maximum number of instances whose weight to the closest cluster can be memorized in order to avoid
recomputation later.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansBarBarInitialization.Initialize(Microsoft.ML.IHost,System.Int32,Microsoft.ML.IChannel,Microsoft.ML.Trainers.FeatureFloatVectorCursor.Factory,System.Int32,System.Int32,Microsoft.ML.Data.VBuffer{System.Single}[],System.Int64,System.Int64@,System.Int64@)">
<summary>
KMeans|| Implementation, see https://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf
This algorithm will require:
- (k * overSampleFactor * rounds * dimensionality * 4) bytes for the final sampled clusters.
- (k * overSampleFactor * numThreads * dimensionality * 4) bytes for the per-round sampling.
Uses memory in initializationState to cache distances and avoids unnecessary distance computations,
akin to the YinYang K-Means paper.
Everywhere in this function, weight of an instance x from a cluster c means weight(x,c) = dist(x,c)^2-norm(x)^2.
We store weight in most cases to avoid unnecessary computation of norm(x).
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansRandomInit.Initialize(Microsoft.ML.IHost,System.Int32,Microsoft.ML.IChannel,Microsoft.ML.Trainers.FeatureFloatVectorCursor.Factory,System.Int32,Microsoft.ML.Data.VBuffer{System.Single}[],System.Int64@,System.Int64@)">
<summary>
Initialize starting centroids via reservoir sampling.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.WorkChunkStateBase.Reduce(Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.WorkChunkState[],Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.ReducedWorkChunkState)">
<summary>
Reduces the array of work chunks into this chunk, coalescing the
results from multiple worker threads partitioned over a parallel cursor set and
clearing their values to prepare them for the next iteration.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.ReducedWorkChunkState.UpdateClusters(Microsoft.ML.Data.VBuffer{System.Single}[],System.Single[],System.Single[],System.Single@)">
<summary>
Updates all the passed in variables with the results of the most recent iteration
of cluster assignment. It is assumed that centroids will contain the previous results
of this call.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.SharedState.SetYinYangCluster(System.Int32,Microsoft.ML.Data.VBuffer{System.Single}@,System.Single,System.Int32,System.Single)">
<summary>
When assigning an accelerated row to a cluster, we store away the distance
to its closest and second-closest cluster, as well as the identity of the new
closest cluster. This method returns the last known closest cluster.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.SharedState.UpdateYinYangBounds(System.Int32)">
<summary>
Updates the known YinYang bounds for the given row using the centroid position
deltas from the previous iteration.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.SharedState.IsYinYangGloballyBound(System.Int32)">
<summary>
Determines if the triangle distance inequality still applies to the given row,
allowing us to avoid per-cluster distance computation.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.ProcessChunk(Microsoft.ML.Trainers.FeatureFloatVectorCursor,Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.SharedState,Microsoft.ML.Trainers.KMeansLloydsYinYangTrain.WorkChunkStateBase,System.Int32,Microsoft.ML.Data.VBuffer{System.Single}[],System.Single[])">
<summary>
Performs the 'update' step of KMeans. This method is passed a WorkChunkState. In the parallel version
this chunk will be one of _numThreads chunks and the RowCursor will be part of a RowCursorSet. In the
unthreaded version, this chunk will be the final chunk and hold state for the entire data set.
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansUtils.ParallelWeightedReservoirSample(Microsoft.ML.IHost,System.Int32,System.Int32,Microsoft.ML.Trainers.FeatureFloatVectorCursor.Factory,Microsoft.ML.Trainers.KMeansUtils.WeightFunc,Microsoft.ML.Trainers.KMeansUtils.RowIndexGetter,Microsoft.ML.Data.VBuffer{System.Single}[]@,Microsoft.ML.Internal.Utilities.Heap{Microsoft.ML.Trainers.KMeansUtils.WeightedPoint}[]@)">
<summary>
Performs a multithreaded version of weighted reservoir sampling, returning
an array of numSamples, where each sample has been selected from the
data set with a probability of numSamples/N * weight/(sum(weight)). Buffer
is sized to the number of threads plus one and stores the minheaps needed to
perform the per-thread reservoir samples.
This method assumes that numSamples is much smaller than the full dataset as
it expects to be able to sample numSamples * numThreads.
This is based on the 'A-Res' algorithm in 'Weighted Random Sampling', 2005; Efraimidis, Spirakis:
https://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf
</summary>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansUtils.ParallelMapReduce``2(System.Int32,Microsoft.ML.IHost,Microsoft.ML.Trainers.FeatureFloatVectorCursor.Factory,Microsoft.ML.Trainers.KMeansUtils.RowIndexGetter,Microsoft.ML.Trainers.KMeansUtils.InitAction{``0},Microsoft.ML.Trainers.KMeansUtils.MapAction{``0},Microsoft.ML.Trainers.KMeansUtils.ReduceAction{``0,``1},``0[]@,``1@)">
<summary>
Takes a data cursor and perform an in-memory parallel aggregation operation on it. This
helper wraps some of the behavior common to parallel operations over a IRowCursor set,
including building the set, creating separate Random instances, and IRowCursor disposal.
</summary>
<typeparam name="TPartitionState">The type that each parallel cursor will be expected to aggregate to.</typeparam>
<typeparam name="TGlobalState">The type of the final output from combining each per-thread instance of TInterAgg.</typeparam>
<param name="numThreads"></param>
<param name="baseHost"></param>
<param name="factory"></param>
<param name="rowIndexGetter"></param>
<param name="initChunk">Initializes an instance of TInterAgg, or prepares/clears it if it is already allocated.</param>
<param name="mapper">Invoked for every row, should update TInterAgg using row cursor data.</param>
<param name="reducer">Invoked after all row cursors have completed, combines the entire array of TInterAgg instances into a final TAgg result.</param>
<param name="buffer">A reusable buffer array of TInterAgg.</param>
<param name="result">A reusable reference to the final result.</param>
<returns></returns>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansUtils.FindBestCluster(Microsoft.ML.Data.VBuffer{System.Single}@,Microsoft.ML.Data.VBuffer{System.Single}[],System.Single[],System.Int32,System.Boolean,System.Single@,System.Int32@,System.Single@,System.Int32@)">
<summary>
Given a point and a set of centroids this method will determine the closest centroid
using L2 distance. It will return a value equivalent to that distance, the index of the
closest cluster, and a value equivalent to the distance to the second-nearest cluster.
</summary>
<param name="features"></param>
<param name="centroids"></param>
<param name="centroidL2s">The L2 norms of the centroids. Used for efficiency and expected to be computed up front.</param>
<param name="centroidCount">The number of centroids. Must be less than or equal to the length of the centroid array.</param>
<param name="needRealDistance">Whether to return a real L2 distance, or a value missing the L2 norm of <paramref name="features"/>.</param>
<param name="minDistance">The distance between <paramref name="features"/> and the nearest centroid in <paramref name="centroids" />.</param>
<param name="cluster">The index of the nearest centroid.</param>
<param name="secMinDistance">The second nearest distance, or PosInf if <paramref name="centroids" /> only contains a single point.</param>
<param name="secCluster">The index of the second nearest centroid, or -1 if <paramref name="centroids" /> only contains a single point.</param>
</member>
<member name="M:Microsoft.ML.Trainers.KMeansUtils.VerifyModelConsistency(Microsoft.ML.Data.VBuffer{System.Single}[])">
<summary>
Checks that all coordinates of all centroids are finite, and throws otherwise
</summary>
</member>
</members>
</doc>