; $Id: kmeans.pro,v 1.1 1993/04/02 18:54:39 idl Exp $ ; Copyright (c) 1991-1993, Research Systems Inc. All rights ; reserved. Unauthorized reproduction prohibited. Function Normal1, Data,R,C ;Normal1 returns the matrix obtained by normalizing the columns ; of Data. R= # of rows, C= # of columns. Y= Data-Data#Replicate(1./R,R) #Replicate(1.,R) std =sqrt(Y^2 # Replicate(1./(R-1),R)) D1=Fltarr(c,c) for i=0l,C-1 do $ if std(i) NE 0 then D1(i,i)=1./std(i) else d1(i,i)=0 return, D1#Y end ;Normal1 Pro StatComp,D,V,N,Mx,Mn,STD,SS ; compute summary statistics,min,max,standard deviation for ; all variables in a cluster whose cases are listed in V. ; Statistics stored in the 1-dim arrays, Mx=Max,Mn=Min, ; STD=standard deviation,SS=sum of squares. D=Data, see ; procedure kmeans. R=Size(D) N=R(1) ; N = # of variables R1=R(0) R=R(2) ; R = # of cases Y= D-V#Replicate(1.,R) ; Compute sum of squares and ; standard deviation if(R1 GT 1) THEN SS= Y^2#Replicate(1.,R) ELSE SS=Y^2 if(R1 GT 1) THEN STD=SQRT(Y^2#Replicate(1./(R-1),R)) $ ELSE STD=FLtArr(N) Mx=Fltarr(N) MN=Mx for i=0l,N-1 DO BEGIN ; compute max and min MN(i)=min(D(i,*)) Mx(i)=max(D(i,*)) ENDFOR RETURN END Pro VAnova,Data,Cluster,VarNames,SS,unit,N,CN,R,SizeB,B1,SW1 ;Compute and print out 1-way analysis of variance for each ;variable. Interpret clusters as treatments. N=# of variables, ; CN= #of clusters, R=of cases, SizeB(i)= size of ith cluster. Common KBlock,B ;B(i,j)= ith variable mean ; for cluster j. printf,unit,Format='(/,/,/)' printf,unit," Overall Variable Statistics" printf,unit,"*******************************************************************" printf,unit, $ Format='("Variable",6x,"Within SS",4X,"DF",4x,"Between SS",4X,"DF",5X,"FRatio")' SW1=Fltarr(N) MS= Replicate(1.,N)#SizeB ;compute between sum of squares TT= B*MS#Replicate(1.,CN) TT= TT*TT/Total(SizeB) B1= B^2 * MS#Replicate(1.,CN)-TT ; B1(i) = between SS for ith ;variable DFB=CN-1 ; compute between degrees of freedom DFW=R-DFB-1 ; compute within degrees of freedom if DFB EQ 0 THEN DFB= 1.e-10 if DFW EQ 0 THEN DFW = 1.e-10 for i=0l,N-1 DO BEGIN SSB=B1(i) SW= Total(SS(i,*)) ; within SS SW1(i)=SW printf,unit,Format='(A8,4X,G11.4,3X,I3,3X,G11.4,3X,I3,1X,G11.4)',VarNames(i),SW,DFW,SSB,DFB,SSB*(DFW)/(SW*DFB) ENDFOR RETURN END ;Procedure Vanova Pro OutPut1, Clus,Data,CaseNam,VarNam,unit,SD1,SB1,SW1 ; Output mean,min,max and standard deviation for each ; variable for each cluster.Also, output overall analysis ; of variance. Common KBlock,B ;B(i,j) = value of ith variable in jth cluster SB=Size(B) K= SB(2) ; final # of clusters N= SB(1) ; number of variables SD=Size(Data) CN=SD(2) ; number of cases SD1=Fltarr(N,K) NC=Size(CaseNam) if(NC(1) NE 0 ) THEN BEGIN ;check and, maybe, fix case names CaseName=CaseNam if(NC(1) LT CN) THEN BEGIN printf,unit,'kmeans-missing Case names' I=Indgen(CN) CaseName=[CaseName,'Case'+STRTRIM(I(NC(1):CN-1),2.)] ENDIF ENDIF ELSE CaseName= 'Case'+STRTRim(INDgen(CN),2) NC=Size(VarNam) if(NC(1) NE 0 ) THEN BEGIN ;likewise, for variable names VarName=VarNam if(NC(1) LT N) THEN BEGIN printf,unit,'kmeans-missing Variable names' I=Indgen(N) VarName=[VarName,'Var'+STRTRIM(I(NC(1):N-1)),2] ENDIF ENDIF ELSE VarName='Var'+ STRTrim(INDgen(N),2) SSW=Fltarr(N,K) ;SSW(i)= sum of squares for cluster i SizeB= FltArr(K) ;SizeB(i) = # of cases in cluster i for i = 0l,K-1 DO BEGIN ;print out cases and statistics for ;each cluster V=B(*,i) ;V= vector of mean var values for ith cluster printf,unit,Format='(/,/,/)' printf,unit,Format='(20X,"Cluster:",I12)',i printf,unit,'********************************************************************************' printf,unit, $ Format='(/,6X,"Case",5X,"Distance",3X,"|",5X,"Var",8X,"Min",8X,"Max",7X,"Mean",8X,"STD",/,/)' CL=where(Clus EQ i,count) ;Cl= list of cases in ith cluster if count ne 0 THEN BEGIN SizeB(i)=count M=count> N D1=Data(*,CL) ;D1 is Data restricted to the ;cases in cluster i. StatComp,D1,V,N,mx, mn,STD,SS ; compute var stats SSW(*,i)=SS ;compute within sum of squares for anova SD1(*,i)=STD for j=0l,M-1 DO BEGIN if(J LE count-1) THEN BEGIN ;print Cases and distances Dist= Data(*,CL(J))-V printf,unit, $ Format='(A10,2X,G11.4,3X,A1,$)', $ CaseName(CL(j)),SQRT(Total(Dist*Dist)),"|" ENDIF ELSE printf,unit,Format='(26X,"|",$)' if(J LE N-1) THEN BEGIN ; print variables and their stats printf,unit,Format='(2X,A6,G11.4,G11.4,G11.4,G11.4)', $ VarName(j),mn(j),mx(j),V(j),STD(j) ENDIF ELSE printf,unit," " ENDFOR ENDIF ENDFor VAnova,Data,Clus,VarName,SSW,unit,N,K,CN,SizeB,SB1,SW1 ; output anova RETURN END ; OutPut1 Pro KMeans1, Data,Clus,iter,K ; Data is a (C,R) dimensioned array of R cases and C ; variables.Clus must be a one-dimensional array of size ; R and with each element between 0 and K-1. K is the number ; of clusters and iter the maximum #of times to relocate case ; in clusters to minimize error. On exit, CLUS represents ; the final clusters of the partition. Clus(i)=cluster ; containing casei. Common KBlock,B On_Error,2 SD=Size(Data) if SD(0) eq 1 then R = 1 else R=SD(2) ; R= # of cases C=SD(1) ; N= # of variables B=Fltarr(C,K) D=Fltarr(K,R) INCLUS=FLtarr(R,k) L=Fltarr(K) ;L(i)= size of ith cluster for i=0l,k-1 DO BEGIN X=( CLUS EQ i) L(i)= Total(x) if(L(i) NE 0) THEN X=X/L(i) INCLUS(*,i)=X END B= Data#INCLUS ;B(N,L)= mean of the Nth variable in cluster L CD=Fltarr(R) X=Data-B(*,Clus) e = Total(X*X) for j=1l,iter DO BEGIN ; reiteratively, relocate ; cases in clusters ; to reduce error GotOne=0 for i= 0l,R-1 DO BEGIN ; run thru cases Min=1.e30 NK=-1 D1= Data(*,i)-B(*,Clus(i)) ; if(L(CLUS(i)) GT 1) $ THEN D1= L(CLUS(I))*Total(D1*D1)/(L(Clus(i))-1) $ else D1=0 ;D1=~Distance from i to its cluster Clus(i) for n= 0l,k-1 DO BEGIN if(n NE CLUS(i)) THEN BEGIN D2= Data(*,i)-B(*,n) D2= L(n)*Total(D2*D2)/(L(n)+1) ;D2=~Distance from case i to cluster n if (D2-D1 LT Min) THEN BEGIN ;D1-D2= error change if object i is ; relocated into cluster n Min=D2-D1 NK=n ENDIF ENDIF ENDFOR if(Min LT 0) THEN BEGIN ;relocate to cluster NK if it reduces error B(*,CLus(i))= $ (B(*,Clus(i))*L(CLus(i))-Data(*,i))/(L(Clus(i))-1) ;update cluster means after relocation B(*,NK)=(B(*,NK)*L(NK)+Data(*,i))/(L(NK)+1) L(CLUS(I))=L(CLUS(I))-1 ; update cluster size CLUS(I)=NK L(NK)=L(NK)+1 e=e+Min ; update error GotOne=1 ENDIF ENDFOR if(Not GotOne ) THEN RETURN ;quit with final Partition if ;cant relocate to reduce error ENDFOR RETURN END Pro kmeans,Data1,CLuster,VarName = VarName,CaseName = CaseName,$ Iter=IT,Number=n,Norm=Nr,Missing=M, List_Name=LN,$ ClustMeans=CM,ClustSTD=CS,SSBetween=SB,SSWithin=SW,NoPrint=NP ;+ ; ; NAME: ; KMEANS ; ; PURPOSE: ; To split the cases in Data into n (default=2) groups that have ; minimum in group variation relative to between group variation. ; ; CATEGORY: ; Statistics. ; ; CALLING SEQUENCE: ; KMEANS, Data, Cluster, VarName, CaseName ; ; INPUTS: ; Data = a (C,R) dimensioned array where R is ; the number of cases to be partitioned and C is the number ; of variables. ; ; KEYWORDS: ; Number = the number of clusters in final ; partition. The default is 2 ; Iter = the number of iterations used to assign ; cases to clusters. The default is 50. ; Norm = flag, if set, to signal whether to ; normalize the variable values in Data. ; Values normalized only if Norm=1. ; ; Missing = missing data value. If undefined, ; assume no missing data ; ; List_Name= name of output file. Default is to ; the screen. ; ClustMeans=array of cluster means. If defined on ; entry, CM(i,j) = mean of ith variable ; in j th cluster on exit ; ClustSTD = array of cluster standard deviations. ; If defined on entry, CS(i,j) = ; standard deviation of ith variable ; in jth cluster on exit ; SSBetween = array of sum of squares between ; clusters. If defined on entry, ; SSB(i)= sum of squares for ith ; variable. ; SSWithin = array of sum of squares within clusters. ; SSW is the analogue for SSB ; VarName= one dimensional array of C variable names ; CaseName= one dimensional array of R case names ; ; OUTPUT: ; Summary statistics for each variable for each cluster, overall ; analysis of variance for each variable. ; ; OPTIONAL OUTPUT PARAMETERS: ; Cluster a one dimensional array. Cluster(i) = the cluster containing ; case i in the final partition ; ; RESTRICTIONS: ; None. ; ; COMMON BLOCKS: ; None. ; ; SIDE EFFECTS: ; None. ; ; PROCEDURE: ; Adapted from algorithm in Clustering Algorithms by Hartigan, ; Wiley Series in Probablity and Mathematical Statistics, Chapt.4. ; Function kmeans1 implements a function that given a partition P ; returns a partition P' in the same neighborhood with reduced in group ; error. ; ; Function is called repeatedly until it finds a fixed point or local ; minimum. Kmeans1 recomputes cluster means after each reassignment. ; ; Procedure Kmeans successively finds partitions with the starting ; partition for K the final partition for K-1 with the case farthest ; from its cluster mean split off to form a new cluster. ;- ;on_error,2 Common KBlock,B SD=Size(Data1) if(N_ELements(LN) EQ 0)THEN Unit = -1 $ else openw,unit,/Get,LN if(SD(0) NE 2) THEN BEGIN printf,unit, $ " kmeans- Data must be a two dimensional array." goto,Done1 ENDIF Data = Data1 if (N_Elements(M) EQ 1) THEN BEGIN Data=ListWise(Data,M) if N_elements(Data) le 1 THEN $ if(Data EQ -1) THEN BEGIN printf,unit, $ "kmeans- halted since all cases have missing data." goto,DONE1 ENDIF SD = size(Data) ENDIF C=Sd(1) if SD(0) eq 1 then R = 1 else R=SD(2) if( N_Elements(N) EQ 1) then if(n GT R) THEN BEGIN printf,unit, $ "kmeans- Partition number must not be greater than number of cases" goto,DONE1 ENDIF else PN=n else PN= 2 if(N_Elements(IT) NE 0) THEN Iter= IT else Iter =50 if(N_Elements(nr) NE 0) THEN if ( nr EQ 1) THEN BEGIN D=Data Data= Normal1(Data,R,C) ENDIF Cluster = Fltarr(R) ; Clus(i)= cluster containing case i ; initially all cases in same cluster Temp=Replicate(1.,C) for i=0l,PN-1 DO BEGIN ; successively, construct cluster partitions kmeans1,data,cluster,iter,i+1 max2=0 ind1 = -1 for j=0l,i do begin ;Find case farthest from cluster mean Ind= where(cluster EQ j,count) if(count NE 0) THEN BEGIN A=Data(*,IND)-B(*,j)#Replicate(1.,count) if count eq 1 THEN BEGIN here = 0 max1 = Total(Temp *A^2) ENDIF ELSE BEGIN max1= max(Temp # (A*A),here) here=Ind(here) ENDELSE if(max1 GT max2) THEN BEGIN ind1=here max2=max1 ENDIF ENDIF ENDFOR if(i NE PN-1 and ind1 ne -1) THEN Cluster(ind1)= i+1 ENDFOR if (N_Elements(NP) EQ 0) THEN $ OutPut1,cluster,Data,CaseName,VarName,unit,SD1,SB1,SW1 if(N_Elements(CS) NE 0) THEN CS=SD1 if(N_Elements(SB) NE 0) THEN SB=SB1 if(N_Elements(SW) NE 0) THEN SW=SW1 if(N_Elements(CM) NE 0) THEN CM=B DONE1: if( unit NE -1) Then Free_Lun ,unit RETURN END ; kmeans