[ml] k-means data clustering... Bueller?
David Faden
dfaden at gmail.com
Thu Jun 18 01:59:24 UTC 2009
Oops... and here's the .c file :-) I won't make it tonight again, but
I'll look forward to see where you go with this project.
David
On Wed, Jun 17, 2009 at 5:42 PM, David Faden<dfaden at gmail.com> wrote:
> I've attached some C code for k-means. I think this was partially
> automatically translated from Fortran so is somewhat ugly. The author
> has okayed its use. (I guess you might want to go with a GPL
> implementation just to be perfectly safe though.) I believe it should
> build with no dependencies.
>
> David
>
> On Tue, Jun 16, 2009 at 5:19 PM, Almir Karic<almir at kiberpipa.org> wrote:
>> i have toyed with it, scipy.cluster.vq.kmeans2 :D does pretty much everything
>> we need/want, you can pass it means or you can tell it how many means you want
>> and it randomly picks them (ofcourse in both cases telling you to which of the
>> means your vectors are closest to)
>>
>> On Tue, Jun 16, 2009 at 03:04:02PM -0700, Josh Myer wrote:
>>> Following up on Michael's post about HMM libraries, has anyone started
>>> on the k-means part of the wiimote stuff? Any libraries to post
>>> about, etc?
>>> --
>>> Josh Myer 650.248.3796
>>> josh at joshisanerd.com
>>> _______________________________________________
>>> ml mailing list
>>> ml at lists.noisebridge.net
>>> https://www.noisebridge.net/mailman/listinfo/ml
>> _______________________________________________
>> ml mailing list
>> ml at lists.noisebridge.net
>> https://www.noisebridge.net/mailman/listinfo/ml
>>
>
>
>
> --
> David Faden, dfaden at iastate.edu
> AIM: pitulx
>
--
David Faden, dfaden at iastate.edu
AIM: pitulx
-------------- next part --------------
#define BIG 1e+40
#include "array.h"
#include "kmeans.h"
#include <stdlib.h>
/**
* Run k-means on one-dimensional data.
* Returns a nonzero value to indicate failure.
*/
int kmeans1dCenters(const double* y, int len,
double* centers, int k)
{
int* classes = calloc(len, sizeof(int));
int* counts = calloc(k, sizeof(int));
int status = kmeans1d(y, len, centers, k, classes, counts);
free(classes);
free(counts);
return status;
}
/* Code based on Applied Statistics algorithms (C) Royal Statistical Society
1979. Adapted for C by Ranjan Maitra, Baltimore, 07/12/02 */
/* identical to kmns.c except for indx which is a number here */
static void optra(const double * const * a,int m,int n,double **c,int k,int *ic1,int *ic2,int *nc,
double *an1,double *an2,int *ncp,double *d,int *itran,int *live,
int *indx);
static void qtran(const double * const * a,int m,int n,double **c,int k,int *ic1,int *ic2,int *nc,
double *an1,double *an2,int *ncp,double *d,int *itran,int *indx);
/**
* Run k-means on one-dimensional data.
* Returns a nonzero value to indicate failure.
*/
int kmeans1d(const double* y, int len, double* centers,
int k, int* classes, int* counts)
{
int ifault = 0;
const double** a = calloc(len, sizeof(double*));
double** c = calloc(k, sizeof(double*));
double* wss = calloc(k, sizeof(double));
const int dim = 1;
const int iter = 1000;
int i;
for (i = 0; i < len; ++i) {
a[i] = &y[i];
}
for (i = 0; i < k; ++i) {
c[i] = ¢ers[i];
}
kmeans((const double* const*) a, len, dim, c, k, classes, counts, iter, wss, &ifault);
/* By construction, the modifications in c have been reflected in centers. */
free(a);
free(c);
free(wss);
return ifault;
}
/** ALGORITHM AS 136 APPL. STATIST. (1979) VOL.28, NO.1
* Divide M points in N-dimensional space into K clusters so that the within
* cluster sum of squares is minimized.
*
* a -- m x n input array
* m -- number of points/rows in a
* n -- dimension of a point (length of a row in a)
* c -- k x n array of centers
* k -- number of clusters
* ic1 -- m-length workspace (closest center to ith point)
* nc -- k-length workspace (number of points in jth cluster)
* iter -- max num of iterations
* wss -- k-length workspace
* ifault -- a nonzero value indicates an error
*/
void kmeans(const double * const * a, int m, int n, double **c, int k, int *ic1,int *nc,
int iter,double *wss,int *ifault)
{
int i,j,l,ij,il,indx,*ic2,*ncp,*itran,*live;
double temp,da,db,dc,dt[2],*an1,*an2,*d;
/* ALGORITHM AS 136 APPL. STATIST. (1979) VOL.28, NO.1
Divide M points in N-dimensional space into K clusters so that the within
cluster sum of squares is minimized. */
MAKE_VECTOR(ic2,m);
*ifault = 3;
if ((k <=1) || (k >= m)) { /* check for number of clusters */
return;
}
*ifault = 0;
/* For each point i, find its two closest centres, ic1(i) and ic2(i).
Assign it to ic1(i). */
for (i=0;i<m;i++) {
ic1[i] = 0;
ic2[i] = 1;
for (il=0;il<2;++il) {
dt[il]=0.;
for (j=0;j<n;++j) {
da=a[i][j]-c[il][j];
dt[il]+=da*da;
}
}
if (dt[0] >= dt[1]) {
ic1[i] = 1;
ic2[i] = 0;
temp=dt[0];
dt[0] = dt[1];
dt[1] = temp;
}
for (l=2;l<k;++l) {
db=0.;
for (j=0;j<n;++j) {
dc=a[i][j]-c[l][j];
db+=dc*dc;
if (db>dt[1]) {
j=n; /*go to the end of the loop -- end of story*/
}
}
if (db<dt[1]) {
if (db>=dt[0]) {
dt[1] = db;
ic2[i] = l;
}
else {
dt[1] = dt[0];
ic2[i] = ic1[i];
dt[0] = db;
ic1[i] = l;
}
}
}
}
/* Update cluster centres to be the average of points contained within them. */
MAKE_VECTOR(an1,k);
MAKE_VECTOR(an2,k);
MAKE_VECTOR(d,m);
MAKE_VECTOR(itran,k);
MAKE_VECTOR(live,k);
MAKE_VECTOR(ncp,k);
for (l=0;l<k;++l) {
nc[l] = 0;
for (j=0;j<n;++j) {
c[l][j]=0.;
}
}
for (i=0;i<m;++i) {
nc[ic1[i]]++;
for (j=0;j<n;++j) {
c[ic1[i]][j]+=a[i][j];
}
}
/* Check to see if there is any empty cluster at this stage */
for (l=0;l<k;++l) {
if (nc[l] == 0) {
*ifault=1;
return;
}
for (j=0;j<n;++j) {
c[l][j]/=nc[l];
}
/* Initialize an1, an2, itran & ncp:
an1[l]=nc[l]/(nc[l]-1); an2[l]=nc[L]/(nc[l]+1)
itran[l] = 1 if cluster l is updated in the quick-transfer stage, 0 ow.
In the optimal-transfer stage, ncp(l) stores the step at which cluster
l is last updated. In the quick-transfer stage, ncp(l) stores the step
at which cluster l is last updated plus m. */
an2[l] =nc[l]/(nc[l]+1.0);
an1[l] = BIG;
if (nc[l]>1) {
an1[l] =nc[l]/(nc[l]-1.0);
}
itran[l] = 1;
ncp[l] = -1;
}
indx=0;
for (ij=0;ij<iter;++ij) {
/* In this stage, there is only one pass through the data. Each point is
re-allocated, if necessary, to the cluster that will induce the maximum
reduction in within-cluster sum of squares. */
optra(a,m,n,c,k,ic1,ic2,nc,an1,an2,ncp,d,itran,live,&indx);
/* Stop if no transfer took place in the last m optimal transfer steps. */
if (indx == m) {
ij=iter;
}
else { /* Each point is tested in turn to see if it should be re-allocated
to the cluster to which it is most likely to be transferred,
ic2[i], from its present cluster, ic1[i]. Loop through the data
until no further change is to take place. */
qtran(a,m,n,c,k,ic1,ic2,nc,an1,an2,ncp,d,itran,&indx);
if (k==2) { /* k=2: no need to re-enter the optimal transfer stage*/
ij=iter;
}
else { /* ncp has to be set to 0 before entering optra. */
for (l=0;l<k; ++l) {
ncp[l] = 0;
}
}
}
}
if ((indx!=m) && (k!=2)) {
*ifault = 2; /* iterations exceeded: may indicate unforeseen looping */
}
for (l=0;l<k;++l) { /* Compute within-cluster SS for each cluster. */
wss[l]=0.;
for (j=0;j<n;++j) {
c[l][j]=0.;
}
}
for (i=0;i<m;++i) {
for (j=0;j<n;++j) {
c[ic1[i]][j]+=a[i][j];
}
}
for (j=0;j<n;++j) {
for (l=0;l<k;++l) {
c[l][j] /= (double)nc[l];
}
for (i=0;i<m;++i) {
da = a[i][j]-c[ic1[i]][j];
wss[ic1[i]]+=da*da;
}
}
FREE_VECTOR(ic2);
FREE_VECTOR(an1);
FREE_VECTOR(d);
FREE_VECTOR(itran);
FREE_VECTOR(live);
FREE_VECTOR(ncp);
FREE_VECTOR(an2);
}
static void optra(const double * const*a,int m,int n,double **c,int k,int *ic1,int *ic2,int *nc,
double *an1,double *an2,int *ncp,double *d,int *itran,int *live,
int *indx)
{
int i,j,l,l1,l2,ll,flag;
double r2,da,db,dc,dd,de,df,rr,al1,al2,alt,alw;
/* ALGORITHM AS 136.1 APPL. STATIST. (1979) VOL.28, NO.1
This is the optimal transfer stage. Each point is re-allocated, if
necessary, to the cluster that will induce a maximum reduction in the
within-cluster sum of squares.
If cluster L is updated in the last quick-transfer stage, it belongs to
the live set throughout this stage. Otherwise, at each step, it is not
in the live set if it has not been updated in the last M optimal transfer
steps. */
for (l=0;l<k;++l) {
if (itran[l] == 1) {
live[l]=m+1;
}
}
for (i=0;i<m;++i) {
(*indx)++;
l1 = ic1[i];
l2 = ic2[i];
ll = l2;
/* If point I is the only member of cluster L1, no transfer. */
if (nc[l1] != 1) {
/* If L1 has not yet been updated, no need to re-compute D(I).*/
if (ncp[l1] != 0) {
de=0.;
for (j=0;j<n;++j) {
df=a[i][j]-c[l1][j];
de+=df*df;
}
d[i]=de*an1[l1];
}
/* Find the cluster with minimum R2. */
da=0.;
for (j=0;j<n;++j) {
db=a[i][j]-c[l2][j];
da+=db*db;
}
r2=da*an2[l2];
for (l=0;l<k;++l) {
/* If I >= LIVE(L1), then L1 is not in the live set. If this is
true, we only need to consider clusters that are in the live set
for possible transfer of point I. Otherwise, we need to consider
all possible clusters. */
if (((i>=(live[l1]-1)) && (i>=(live[l]-1))) || (l == l1) || (l == ll)) {
}
else {
rr=r2/an2[l];
dc=0.;
flag=0;
j=0;
while ((flag==0) && (j<n)) {
dd=a[i][j]-c[l][j];
dc+=dd*dd;
if (dc>=rr) {
flag=1;
}
j++;
}
if (flag==0) {
r2=dc*an2[l];
l2=l;
}
}
}
if (r2>=d[i]) {/* If no transfer is necessary, L2 is the new IC2(I). */
ic2[i] = l2;
}
else {/* Update cluster centres, LIVE, NCP, AN1 & AN2 for clusters L1 and
L2, and update IC1(I) & IC2(I). */
(*indx)=0;
live[l1]=m+i+1;
live[l2]=m+i+1;
ncp[l1]=i+1;
ncp[l2]=i+1;
al1=(double)nc[l1];
alw=al1-1.0;
al2=(double)nc[l2];
alt=al2+1.0;
for (j=0;j<n;++j) {
c[l1][j]=(c[l1][j]*al1-a[i][j])/alw;
c[l2][j]=(c[l2][j]*al2+a[i][j])/alt;
}
nc[l1]--;
nc[l2]++;
an2[l1]=alw/ al1;
an1[l1]=BIG;
if (alw>1.0) {
an1[l1]=alw/(alw-1.0);
}
an1[l2]=alt/al2;
an2[l2]=alt/(alt+1.0);
ic1[i]=l2;
ic2[i]=l1;
}
}
if ((*indx)==m) {
return;
}
}
for (l=0;l<k;++l) { /* ITRAN(L) = 0 before entering QTRAN. Also, LIVE(L) has
to be decreased by M before re-entering OPTRA. */
itran[l]=0;
live[l]-=m;
}
return;
}
static void qtran(const double * const*a,int m,int n,double **c,int k,int *ic1,int *ic2,int *nc,
double *an1,double *an2,int *ncp,double *d,int *itran,int *indx)
{
int i,j,l1,l2,icoun,istep,flag,iflag;
double r2,da,db,dd,de,al1,al2,alt,alw;
/* ALGORITHM AS 136.2 APPL. STATIST. (1979) VOL.28, NO.1
This is the quick transfer stage. IC1(I) is the cluster which point I
belongs to. IC2(I) is the cluster which point I is most likely to be
transferred to. For each point I, IC1(I) & IC2(I) are switched, if
necessary, to reduce within-cluster sum of squares. The cluster centres
are updated after each step.
In the optimal transfer stage, NCP(L) indicates the step at which
cluster L is last updated. In the quick transfer stage, NCP(L) is
equal to the step at which cluster L is last updated plus M. */
icoun=0;
istep=0;
iflag=0;
while (iflag==0) {
for (i=0;i<m;i++) {
++icoun;
++istep;
l1 = ic1[i];
l2 = ic2[i];
/* If point I is the only member of cluster L1, no transfer. */
if (nc[l1] != 1) {/* If ISTEP > NCP(L1), no need to re-compute distance
from point I to cluster L1. Note that if cluster
L1 is last updated exactly M steps ago, we still
need to compute the distance from point I to
cluster L1. */
if (istep<=ncp[l1]) {
da=0.;
for (j=0;j<n;++j) {
db = a[i][j]-c[l1][j];
da += db * db;
}
d[i]=da*an1[l1];
/* If ISTEP >= both NCP(L1) & NCP(L2) there will be no transfer
of point I at this step. */
}
if ((istep<ncp[l1]) || (istep < ncp[l2])) {
r2=d[i]/an2[l2];
dd=0.;
flag=0;
j=0;
while ((j<n) && (flag==0)) {
de=a[i][j]-c[l2][j];
dd+=de*de;
if(dd>= r2) {
flag=1;
}
j++;
}
if (flag==0) {
/* Update cluster centres, NCP, NC, ITRAN, AN1 & AN2 for clusters L1
& L2. Also update IC1(I) & IC2(I). Note that if any updating
occurs in this stage, INDX is set back to 0. */
icoun = 0;
(*indx) = 0;
itran[l1]=1;
itran[l2]=1;
ncp[l1]=istep+m;
ncp[l2]=istep+m;
al1=(double)nc[l1];
alw=al1-1.0;
al2=(double)nc[l2];
alt=al2+1.0;
for (j=0;j<n;++j) {
c[l1][j]=(c[l1][j]*al1-a[i][j])/alw;
c[l2][j]=(c[l2][j]*al2+a[i][j])/alt;
}
nc[l1]--;
nc[l2]++;
an2[l1] = alw / al1;
an1[l1] = BIG;
if (alw > 1.0) {
an1[l1] = alw / (alw - 1.0);
}
an1[l2] = alt / al2;
an2[l2] = alt / (alt + 1.0);
ic1[i] = l2;
ic2[i] = l1;
}
}
}
if (icoun==m) return;
}
}
}
More information about the ml
mailing list