##############################################################################
# 
#			Cluster Forests (CF)
#
# This is the program to implement our idea of cluster ensemble by random 
# probing of a high dimensional data cloud from many different views. Each
# of such a view is further calibrated by a data-driven feature pursuit 
# procedure based on the kappa-criterion, which has the desirable property
# of being noise-resistant and in favor of strong features. CF can be 
# convinently viewed as a clustering version of Random Forests.
#
# Author: 	Donghui Yan 			dhyan@berkeley.edu
#		            Aiyou Chen 			Aiyouchen@google.com
#		            Michael I. Jordan	jordan@cs.berkeley.edu 
# Date:		 	05/09/2013
#
# Major revisions: 
# 05/10/2019 Donghui Yan			dyan@umassd.edu
# Implemented the Nystrom version of spectral clustering for large dataset 
# when aggregating clustering instances in the ensemble. The Nystrom 
# algorithm we use was originally developed in Matlab by Malik's group 
# at UC Berkeley. 
############################################################################
#		Copyright 2013-2019, by the author(s).
#		      All rights reserved.
# Permission to make digital or hard copies of all or part of this work for
# personal or classroom use is granted without fee provided that copies are
# not made or distributed for profit or commercial advantage and that copies
# bear this notice and the full citation on the first page. To copy otherwise, 
# to republish, to post on servers or to redistribute to lists, requires prior 
# specific permission.
#
############################################################################
# 
# Note: (1) All parameters are to be modified within this file.
# (2) Results reported in our paper are averaged over 100 repetitions; to 
# produce results in less time, one can use smaller number (e.g., 20) of 
# repetitions with a slight difference in results.
# 
############################################################################
# History:
# 05/09/2013 Original version.
############################################################################
library(MASS);
library(expm);

#####################################################################
# Constants and parameters used by CF
#####################################################################
infty<-100000000000;
#The true scaling parameter value is beta*ntrees
beta<-0.10;
#The numebr of features to sample from the feature set at each pursuit
bb<-2;
#The number of times to sample from the feature set in feature competition
#Set qq=0 to disable feature competition
qq<-0;
#The threshold level for pairwise affinities
#gamma=0.4 is the default
gamma<-0.3;
#The number of clusters in the base clustering algorithm
#Default to be the number of final clusters
nclusterB<-0;
#Number of clustering trees in the ensemble
ntrees<-20;
nRep<-20;
NST<-800;


####################################################################
# To compute the total sum of squares matrix on given features
####################################################################
sst<-function(x,cols)
{
n<-nrow(x);
z1<-x[,cols];
z2<-matrix(0,n,n);

if(length(cols)==1) 
	{ tmp<-kronecker(t(z1),z1,FUN="-"); return(tmp^2);}

for(i in 1:length(cols))
{
	tmp<-kronecker(z1[,i],z1[,i],FUN="-");
	tmp1<-matrix(tmp,n,n);
	z2<-z2+tmp1*tmp1;
}
z2
}## End of function sst()


#####################################################################
## Jitendra Malik's one-shot method for Nystrom
## A=n x n, B=n x (N-n)
## E=N x k
## Commented by dhyan, Aug 21st, 2008
#####################################################################
oneShot<-function(A,B,n,m,k)
{
d1 <- rowSums(A) + rowSums(B); 
d2 <- colSums(B)  + t(B)  %*% ginv(A) %*% rowSums(B);
##cat("d1=",d1,"\n"); cat("d2=",d2,"\n");
##tmp<-rbind(d1,d2); 
tmp<-c(d1,d2);
tmp<-sqrt(1/tmp); 
dhat <- t(tmp);

A = A * (dhat[1:n] %*% t(dhat[1:n]));
B = B * (dhat[1:n] %*% t(dhat[(n+1):(n+m)]));

Asi <- sqrtm(ginv(A));

Q<-A+Asi %*% B %*% t(B) %*% Asi;
Q<-(Q+t(Q))/2;

##cat("dim(Q)=", dim(Q), "\n");
##cat("Number of missing values in Q=", sum(is.na(Q)), "\n");
##browser();
mySvd <- svd(Q);
U<-mySvd$u; L<-mySvd$d; T<-mySvd$v;
##[U,L,T]=svd(Q);
##browser();

##cat("dim(A)=",dim(A),"dim(B)=",dim(B),"dim(Asi)=",dim(Asi),"\n");
##cat("dim(U)=",dim(U),"dim(L)=",dim(L), "\n");
Lm<-matrix(0, length(L), length(L)); diag(Lm)<-1/sqrt(L);
V <- rbind(A, t(B)) %*% Asi %*% U %*% Lm;
E<-matrix(0, n+m, k);
for(i in 2:(k+1)) {  
		E[, i-1] <-V[, i] / V[,1];
}

return(E);
}


##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
##% This function implements Jitendra Malik's one-shot algorithm for Nystrom 
##% dhyan, Aug 18, 2008
##%
##% Dataset
##% ---------		------------------------------------------------
##% Connect-4(STD)	r=20,65.80/10335,M~=12g
##% 			r=50,65.76/1602,M~=4.9g
##% 			r=100,65.82/438,r=200,65.82/181
##% USCI(STD)		r=200,would take at least M~=21g
##%			r=300,93.87/4134,M~=18g
##% 			r=500,93.88/1603,M~=11-12g
##% PokerHand		r=3000,42.04/1047,M~=17g
##% PokerHand(STD)	r=3000,49.97/1035,M~=17g
##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
nystrom<-function(W, n, ncluster)
{
N<-nrow(W); 

cat("Starting Nystrom at", date(),"\n");
idx<-sample(1:N,n,replace=FALSE);
idx2 <- (1:N)[-idx];

A<-W[idx,idx]; B<-W[idx, idx2];

##AA<-A; BB<-B;
##sigmas<-c(seq(0.01,0.1,by=0.01),0.2, 0.3, 0.5, 0.8,1,5, 7, 9,10, 12,15,20,40,50,80,100);
##for(sigma in sigmas)	
##{
##	cat("Sigma=", sigma, "\n");
##	A<-exp(-AA/sigma); B<-exp(-BB/sigma);
	
	E<-oneShot(A, B, n, N-n, ncluster);

	myKmeans<-kmeans(E, ncluster, iter.max = 200, nstart = 20, algorithm = c("Hartigan-Wong"));
	cat("Finished Nystrom at", date(),"\n");
	##cRate(sp,idx,ncluster,N);
##}
##browser();
return(myKmeans$cluster);
}




###########################################################################
# The base clustering algorithm
# This is essentially the k-means clustering algorithm, with a little
# protection against undesirable cases
###########################################################################
bCluster<-function(x,nc,cols)
{
n<-nrow(x);
#To prevent the # distinct points < # cluster centers
tmp<-seq(1,n);
z2<-sst(x,cols);
z2<-z2+kronecker(tmp,t(tmp),FUN=">");
diag(z2)<-1;
z3<-matrix(0,n);
for(i in 1:n) { z3[z2[i,]==0]<-1; }
	
if(n-sum(z3)<nc)  { cl<-data.frame(cbind(1));cl$err<-1; }
else
{
cl<-kmeans(x[,cols],
		centers=nc,
		iter.max=200, nstart=20,
		algorithm=c("Hartigan-Wong"));
cl$err<-0;
}

cl
}##End of function bCluster()



###########################################################################
# Feature competing
# To obtain a strong set of bb features among "ntry" trials
###########################################################################
featureCompeting<-function(x,columns,nc,currCols)
{
ntry<-0;
zkappa<-10^10;
zcol<-NULL;
while(1)
{
	cols<-sample(1:length(columns),bb,replace=FALSE);
	if(is.null(currCols)) { cl<-bCluster(x,nc,cols);}
	else { cl<-bCluster(x,nc,append(currCols,cols));}
	if(cl$err==1) next;
	if(is.null(currCols)) { z2<-sst(x, cols);}
	else { z2<-sst(x,append(currCols,cols));}
	zz<-sum(z2)/2;	#zz computes the total squared distances
	kappa<-sum(cl$withinss)/zz;
	if(kappa<zkappa) { zcol<-cols;zkappa<-kappa;}
	ntry<-ntry+1;
	if(ntry>qq) break;
}
zcol
}## End of function featureCompeting()


###############################################################################
# To generate all possible permutations of a given list
###############################################################################
permu<-function(vect)
{
        n<-length(vect);
	if(n>1)
	{
		pmat<-matrix(0,factorial(n),n);
	        blkSize<-factorial(n-1);
	        for(i in 1:n)
	        {
	        	idxb<-(i-1)*blkSize+1; idxe<-i*blkSize;
			pmat[idxb:idxe,1]<-rep(vect[i],blkSize);
		        tvect<-vect[-i];
		        pmat[idxb:idxe,2:n]<-permu(tvect);
												}
	}
	else { pmat<-vect[1]; }
	pmat
}


###########################################################################
# To compute the clustering accuracy using the true labels in the data.
# 
# If # clusters <=7, look for maximum match over all permutations of 
#       cluster IDs
# Else maximum match over 10000 random sampling from all permutations
#
# No attempt is made to optimize the computation as that is not the focus
# of the algorithm.
###########################################################################
cRate<-function(sp0, sp, nc, N)
{
if(nc>7) { cat("It may take a while as # clusters appears to be large ......\n");}
nrs<-10000;
spx=matrix(0,N,1); spy=matrix(0,N,1);

perms<-permu(seq(1,nc));
tr<-0;
if(nc <8)
{
        np=dim(perms)[1];
	for(i in 1:np)
	{
		for(j in 1:nc) { spx[sp==j]<-perms[i,j]; }
	        tmp<-sum(sp0==spx)/N;
	        if(tr<tmp) {tr=tmp;}
	}
} else
{
        idx<-sample(1:factorial(nc),nrs,replace=FALSE);
        for(i in 1:nrs)
        {       permx=perms[idx[i],];
                for(j in 1:nc) { spx[sp==j]<-permx[j]; }
                tmp<-sum(sp0==spx)/N;
                if(tr<tmp) {tr=tmp;}
        }
}
tr
}


#############################################################################
#			The nCut Algorithm
#
# To implement the Normalized Cut algorithm of Shi and Malik (2000)
# The NCut is equivalent to solving a constrained programming problem
#	(D-W)y=rDy
# with the constraints that y(i) \in {1,-b} and y'D1=0, where the constraint
# y'D1=0 is automatically satisfied and y is relaxed to take real values.
#
#############################################################################
nCut<-function(W,ncluster,n)
{
#Number of points not yet assigned to a cluster
nr<-n;
#Vector to store the original indices of all points currently not clustered
rvec<-(1:n);
#The similarity matrix left after each round of NCut
rsmat<-matrix(0,n,n);
rsmat<-W;
cut<-NULL; 
#To declare return values
discard<-0; mynCut<-NULL;

iloop<-0;
sp<-NA;
while(iloop<ncluster)
{
	iloop<-iloop+1;

	#For the last piece, no further NCut will be performed
	if(iloop==ncluster) 
		{ #cat("iloop =",iloop, "\nThis cut = \n",rvec,"\n\n");
		  sp[rvec]<-iloop;
			next;}

	#Prepare for generalized eigen analysis
	tmp<-rep(1,nr); 
	if(iloop!=1) { rsmat<-rsmat[-cut,]; rsmat<-rsmat[,-cut]; W<-rsmat; }
	tmp<-rsmat%*%tmp;
	D<-matrix(0,nr,nr); sqrtD<-matrix(0,nr,nr);
	if(length(tmp)!=nr){
		cat("The length of tmp is ",length(tmp),"\n");
		cat("The length of diag(D) is ",nr,"\n");
		browser();
		cat("The length of last cut is ",length(cut),"\n");
		cat("The #rows of rsmat is ",length(rsmat[,1]),"\n");
		cat("The #cols of rsmat is ",length(rsmat[1,]),"\n");
	}

	diag(D)<-tmp;
	diag(sqrtD)<-1/(sqrt(tmp));
	A<-sqrtD %*% (D-W);
	A<-A %*% sqrtD;

	#Eigen analysis
	eigens<-eigen(A,symmetric=T);
	z<-sort(eigens$values,index.return=T);
	ssvector<-eigens$vectors[,z$ix[2]];
	cut<-(1:nr)[ssvector>=0];

	#We favor the smaller cutoff
	if(length(cut)>0.5*nr) cut<-(1:nr)[-cut];
	ccut<-rvec[cut];

	#Bad things happen, discard the current loop
	if((length(cut)==n) || (length(cut)==0))
	{ sp[ccut]<-iloop;discard<-1;cat("Discard the current loop\n");break;}

	#To trace back to the original indices of all points currently NCutted
	rvec<-rvec[-cut];
	nr<-nr-length(cut);
	sp[ccut]<-iloop;
	if(nr <1) {cat("Defective sampling\n");break;}
}#End of while(iloop)
mynCut$sp<-sp; mynCut$discard<-discard;
mynCut
}#End of nCut() function


##########################################################################
# The real Cluster Forests function
##########################################################################
clusterForests<-function(x,rmcls,nclusterF)
{
myCf<-NULL;
columns<-names(x)[-rmcls];
n<-nrow(x);
nclusterB<-nclusterF;
smat<-matrix(0,n,n);

iloop<-0;
while(iloop<ntrees)
{
	if(iloop %% 5 == 0) cat("--------iloop=",iloop,"@",date(),"\n");
	lastRatios<-NULL;
	###################################################################
	## firstStep=	TRUE	To enable feature competition
	## 		FALSE	To disable
	###################################################################
	##firstStep<-TRUE;
	firstStep<-FALSE;

	#We are going to grow the tree to the maximum possible
	lastWCD<-infty;		#Within cluster distance
	lastRatio<-infty;	#Ratio of WCD/BCD
	currCols<-NULL;		#The set of features currently used
	x$cluster<-NA;
	clCurr<-NULL;

	numRetry<-0;
	while(1)
	{
		numRetry<-numRetry+1;
		if(firstStep==FALSE)
			{ cols<-sample(1:length(columns),bb,replace=TRUE);}
		else
			{ cols<-featureCompeting(x,columns,nclusterB,currCols); 
			  firstStep<-FALSE;}

		cols<-columns[cols]; cols<-match(cols,names(x));
		tmpCols<-append(currCols, cols);

		cl<-NULL;
		cl<-bCluster(x,nclusterB,tmpCols);
		if(cl$err==1) next;
		z2<-sst(x,tmpCols);
		zz<-sum(z2)/2;	#zz computes the total squared distances

		#########################################################
		# To compute the kappa value 
		#########################################################
		kappa<-sum(cl$withinss)/zz;

		if(kappa<lastRatio)
		{
			lastRatio<-kappa;
			currCols<-tmpCols;
			clCurr<-cl;
		}
		else break;
	} #End of the while(1) loop


	#To recover the original index
	x$cluster[1:n]<-clCurr$cluster;
	ssmat<-kronecker(x$cluster,t(x$cluster),FUN="==");
	ssmat[ssmat==TRUE]<-1;ssmat[ssmat==FALSE]<-0;
	ssmat[is.na(ssmat)]<-0;
	smat<-smat+ssmat;
	iloop<-iloop+1;
	lastRatios<-c(lastRatios,lastRatio);
}#End of while(iloop)

############################################################################
# To threshold and scale the pairwise affinities
############################################################################
smat[smat<ntrees*gamma]<-0;
smat<-exp(beta*smat);

############################################################################
# To obtain final clustering with normalized Cuts or Nystrom if the data is large
############################################################################
if(n<2000)
{
	myCf<-nCut(smat,nclusterF,n);
}
else	##Nystrom for large dataset
{
	sp<-nystrom(smat, NST, nclusterF);	
	myCf$sp<-sp; myCf$discard<-0;
}
myCf
}#End of clusterForests() function



###########################################################################
# The main functon starts here
# This is a wrapper for the Cluster Forests function
###########################################################################
clusterForestsWrapper<-function(x,sp,nclusterF,m)
{
n<-nrow(x);

x$cluster<-NA;
tmp<-c("label","cluster");
rmcls<-match(tmp,names(x));
y<-x[,-rmcls];
for(i in 1:m) { y[,i]<-as.numeric(y[,i]);}
x<-y;
x$label<-NA; x$cluster<-NA;

set.seed(1);
#To compute the similarity matrix of the original data
smat0<-matrix(0,n,n);
smat0<-kronecker(sp,t(sp),FUN="==");
smat0[smat0==TRUE]<-1;smat0[smat0==FALSE]<-0;
sp0<-sp;

accuracy<-NULL; accuracyr<-NULL;
for(j in 1:nRep)
{
cat("================= Iteration ",j, " =================\n");

myCf<-clusterForests(x,rmcls,nclusterF);
sp<-myCf$sp; discard<-myCf$discard;
if(discard==1) next;

#To report the accuracy according to two different metrics
smat1<-kronecker(sp,t(sp),FUN="==");
smat1[smat1==TRUE]<-1;smat1[smat1==FALSE]<-0;
zr<-(sum(smat0==smat1)-n)/(n^2-n)
z<-cRate(sp0,sp,nclusterF,n);
accuracy<-append(accuracy,z);
accuracyr<-append(accuracyr,zr);
cat("--------rateC=", z, "rateR=", zr, "@", date(), "\n");
}#End of the for(j) loop
cat("\n======>The mean accuracy:  rho_c =",mean(accuracy),", rho_r =",mean(accuracyr), "<======\n\n");
}
source("cfLoader.R");
