C hiclust.f
C This software implements the method called HICLUST.
C The author of this software is Stephen C Johnson.
C Note that this program is DIFFERENT from the same-named program in
C icicle.shar, though the two programs are very similar
C Copyright (c) 1993 by AT&T.
C Permission to use, copy, modify, and distribute this software for any
C purpose without fee is hereby granted, provided that this entire
C notice is included in all copies of any software which is or includes
C a copy or modification of this software and in all copies of the
C supporting documentation for such software.
C THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMP-
C LIED WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR AT&T MAKE ANY
C REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
C OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
C This software comes from the FIRST MDS Package of AT&T Bell Laboratories
C For explanation of the method this software implements, see
C "Hierarchical Clustering Schemes" by Stephen C Johnson in
C Psychometrika, 1967, vol. 32, no. 3, pp 241-254
C----+----@----+----@----+----@----+----@----+----@----+----@----+----@
C HIERARCHICAL CLUSTERING REVISITED
C NOTE -- THIS TAKES 28 K ON THE GE 645--MODEL A
C SMALLER DIMENSIONS, LESS CORE.
C THE GE 645 HAS 36 BIT WORDS
C IF YOUR MACHINE HAS LONGER OR SHORTER WORDS, YOU WILL
C WANT TO MODIFY ARRAY FORM AND THE ASSOCIATED READ STATEMENT
C ACCORDINGLY
C THIS WILL TREAT UP TO 100 BY 100 DATA INPUT MATRICES
C IF YOU CANT MORE, YOU WILL HAVE TO CHANGE THE OUTPUT ROUTINES
C AND POSSIBLY NEED SECONDARY STORAGE
C CURRENTLY, THE MAXIMUM AND MINIMUM METHODS ARE OUTPUTED
C OTHER METHODS (SUCH AS AVERAGEING) MAY BE ADDED
C I HAVE TRIED TO INDICATE WHERE YOU WOULD ADD THESE
DIMENSION D(100,100), FORM(11), E(100,100)
DIMENSIONXX(100),LK(100),LKI(100),LKJ(100),IP(200)
DIMENSIONP(200)
LOGICAL W(100)
DIMENSIONSIG(100)
DATA XXXX/1HX/, BLANK/1H /, PERIOD/1H./
C READ IN N = NUMBER OF VARIABLES
C IS = +1 IF MATRIX IS MATRIX OF DISTANCES
C -1 IF MATRIX IS MATRIX OF PROXIMITIES
10 READ11,N,IS
C IF N = NEGATIVE OR 0, STOP
IF (N .LE. 0) STOP
S = IS
N1=N-1
N2=N1-1
XN=N
C READ IN FORMAT
READ12,(FORM(I),I=1,11)
E(1,1)=0.0
C READ IN TRIANGULAR MATRIX WITHOUT DIAGONAL
C MAKE IT INTO A SYMMETRIC ARRAY WITH 0 DIAG., E(I,J)
DO20I=2,N
E(I,I)=0.0
I1=I-1
READFORM,(E(I,J),J=1,I1)
DO20J=1,I1
20 E(J,I)=E(I,J)
C **** NOTE ****
C THIS IS EXPERIMENTAL AND MAY BE OMMITTED
C COMPUTE SUMS, GAMMAS, AND SIGMA FOR EACH I FORM 2 TO N-1
C S1 = SUM OF ENTRIES, SQUARED
C S2 = SUM OF SQUARED ROW SUMS
C
C S3 EQUALS SUM OF SQUARES
S1=0.
S2=0.
S3=0.
DO30I=1,N
T=0.
DO25J=1,N
S3=S3+E(I,J)**2
25 T=T+E(I,J)
S2=S2+T**2
30 S1=S1+T
S1=S1**2
DO40I=2,N1
XK=I
T1=(XK-3.)/(XK-1.)+(XK-1.)/(XN-XK)
T2=(XN-2.)/(XN-XK)-4./(XK-1.)
T3=2./(XK-1.)+1./(XN-XK)
XX3=XN*(XN-1.)*XK
XX2=XX3*(XN-2.)
XX1=XX2*(XN-3.)
G1=(-T1)/XX1
G2=T2/XX2-4.*G1
G3=T3/XX3-G2-2.*G1
40 SIG(I)=S*SQRT(S1*G1+S2*G2+S3*G3)
C **** END OF EXPERIMENTAL SECTION
C INITIALIZE TO FIRST METHOD
K=1
C START A METHOD
C PRINT A TITLE
50 GOTO(51,52),K
51 PRINT61
GOTO98
52 PRINT62
C SET W(I) = .FALSE.
C LK(I)=0
C W(I) = .TRUE. IF LI I HAS BEEN MERGED WITH SOME OTHER PT
C LK(I) = LIST OF PTS MERGED, FOR OUTPUT PURPOSES---SEE BELOW
98 DO99I=1,N
W(I) = .FALSE.
LK(I)=0
C COPY OFF THE ORIGINAL MATRIX INTO TEMPORARY STORAGE
C **** NOTE ****
C YOU MAY NOT WANT TO DO THIS IF PRESSED FOR SPACE
C YOU COULD READ THE DATA DIRECTLY INTO THE D ARRAY, AND NOT
C NEED THE SPACE FOR THE E ARRAY AT ALL
C THEN USING TWO METHODS WOULD REQUIRE READING THE DATA TWICE
DO99J=1,N
99 D(I,J)=E(I,J)
C II COUNTS THE NUMBER OF PAIRS MERGED
DO181II=1,N1
C FIND MIN PAIR DISTANCE
C FIRST FIND TWO UNMERGED POINTS -- INITIALIZE X
C X IS THE RUNNING MINIMUM DISTANCE
100 DO102NI=2,N
IF (W(NI)) GO TO 102
NI1=NI-1
DO101NJ=1,NI1
101 IF (.NOT. W(NJ)) GO TO 104
102 CONTINUE
C MUST BE ONE LEFT
STOP
104 X=D(NI,NJ)
C BEGIN THE REAL BUSINESS OF FINDING THE CLOSEST PAIR
C ONLY LOOK AT UNMERGED POINTS
DO150I=2,N
IF (W(I)) GO TO 150
I1 = I-1
DO149J=1,I1
IF (W(J)) GO TO 149
C THIS IS THE CENTRAL COMPARE STATEMENT
IF((D(I,J)-X)*S .GT. 0.0) GO TO 149
C WE HAVE FOUND A CLOSER PAIR -- UPDATE X, NI, NJ
NI = I
NJ=J
X=D(I,J)
149 CONTINUE
150 CONTINUE
C WE HAVE NOW LOOKED AT EVERY PAIR OF POINTS --
C STORE X IN XX, NI AND NJ IN ARRAYS LKI AND LKJ
XX(II)=X
LKI(II)=NI
LKJ(II)=NJ
C POINT NJ IS CONSIDERED 'MERGED', WHERE BY THIS IS MEANT THAT
C IT NO LONGER POSSESSES A SEPARATE ROW OF THE MATRIX,
C BUT IS ENTIRELY SUBSUMED TO NI, (WHICH IS LARGER).
W(NJ) = .TRUE.
C SELECT UNMERGED POINTS, L, WHICH ARE NOT NEITHER NI NOR NJ
DO179L=1,N
IF (W(L)) GO TO 179
IF (L .EQ. NI) GO TO 179
C BRANCH ACCORDING TO METHOD, AND FIND D(NI,L) AND D(L,NI)
160 GOTO(200,300),K
C *****MINIMUM METHOD*****
C D(NI,L)= MIN(D(NI,L),D(NJ,L))
200 IF((D(NI,L)-D(NJ,L))*S .LE. 0.0) GO TO 170
D(NI,L)=D(NJ,L)
D(L,NI)=D(L,NJ)
GOTO170
C *****MAXIMUM METHOD*****
C D(NI,L)= MAX(D(NI,L),D(NJ,L))
300 IF((D(NI,L)-D(NJ,L))*S .GE. 0.0) GO TO 170
D(NI,L)=D(NJ,L)
D(L,NI)=D(L,NJ)
170 CONTINUE
C GO GET MORE L'S
179 CONTINUE
C NOW GO BACK AND DO AGAIN USING NEW D ARRAY --(ITERATE)
181 CONTINUE
C ITERATION COMPLETE -- BEG IN PRINTOUT ROUTINE
C WE NOW HAVE
C 1. LKI ARRAY
C 2. LKJ ARRAY
C THESE CONTANN SUCCESSIVE LINKS MERGED AT EACH
C ITERATI ON
C LKI(I) IS THE LARGER OF THE TWO
C LKJ(I) IS THE SMALLER OF THE TWO, THE ONE THAT IS
C MERGED (ELIMINATED) AT STEP I
C 3. XX(I) = THE SIZE (VALUE) OF THE LINK MERGED AT STEP I
C ARRANGE THE LINKS IN ORDER FOR PRINTING OUT
C FIRST PUT THE LAST TWO POINTS CONNECTED INTO THE LK ARRAY
LK(1)=LKJ(N1)
LK(2)=LKI(N1)
C THEN WORK BACKWARDS -- PICK A LINKAGE. THE LARGER MEMBER OF
C THIS GROUP MUST BE ALREADY PLACED IN THE LK LIST.
C MOVE THE ENTRIES IN THE LK LIST OVER ONE, AND 'MAKE
C ROOM' FOR THE NEW ENTRY, WHICH GOES IMMEDIATELY TO
C THE LEFT OF THE LARGER ELEMENT.
C IF THESE COMMENTS ARE NOT CLEAR, IT IS IN THE COMBIN-
C ATORIAL NATURE OF THE SUBJECT THAT THIS BE SO,
C AND WORKING THROUGH A SIMPLE EXAMPLE SHOULD BE
C VERY EDIFYING. IM DOING THE BEST I CAN.
DO550II=2,N1
JJ=N-II
C FIND THE LKI ENTRY IN THE LK LIST
DO510KK=1,II
510 IF(LKI(JJ) .EQ. LK(KK)) GO TO 515
C IT MUST BE SOMEWHERE
STOP
C SHOVE THINGS OVER AND PUT THE LKJ ENTRY IN THE LK LIST
515 DO530IK=KK,II
KI=II+KK-IK
530 LK(KI+1)=LK(KI)
550 LK(KK)=LKJ(JJ)
C OVER AND OVER IN THE DO LOOP UNTIL ALL IS DONE
C THE LK ORDER IS THE ORDER IN WHICH THE FINAL OUTPUT WILL
C BE ARRANGED
C INDEX THROUGH THE LKI AND LKJ ARRAYS, USING I
DO585I=1,N1
C FIND LKI(I) IN THE LK ARRAY-- THE INDEX IS JI
DO565JI=1,N
565 IF(LK(JI) .EQ. LKI(I)) GO TO 570
C IT MUST BE SOMEWHERE
STOP
C FIND LKJ(I) IN THE LK ARRAY-- THE INDEX IS JJ
570 DO575JJ=1,N
575 IF(LK(JJ) .EQ. LKJ(I)) GO TO 580
C IT MUST BE SOMEWHERE
STOP
580 LKI(I)=JI
LKJ(I)=JJ
585 CONTINUE
C LKI(II)AND LKJ(II)NOW CONTAIN THE ORDINAL POSITIONS OF THE II LINK
C THE FOLLOWING CODE MAY BE DONE MORE THAN ONCE, DEPENDING ON
C HOW LARGE N IS -- IF N IS MORE THAN 50, THE CODE IS DONE
C TWICE. THE ONLY DIFFERENCE BETWEEN THE TIMES IS WHICH
C PROTION OF THE IP AND P ARRAYS IS PRINTED OUT -- OTHERWISE
C THE SITUATION IS IDENTICAL.
C M1 = STARTING I VALUE
C M2 = FINAL I VALUE
C IP1 = STARTING P VALUE
C IP2 = FINAL P VALUE
C THE FIRST TIME,
C M1 = 1, IP1=1, M2=MIN(50,N), IP2=100 OR 2*N-1, DEPENDING
C AS N GREATER THAN 50, OR NOT
C THE SECOND TIME(WHEN N GREATER THAN 50),
C M1=51, M2=N, IP1=100, IP2=2*N-1
C SET UP FIRST TIME
700 M1=1
IP1=1
IF (N .GT. 50) GO TO 710
M2 = N
ISAIL=1
IP2=2*N-1
GOTO800
710 M2=50
ISAIL=2
IP2=100
GOTO800
C SET UP SECOND TIME
750 PRINT751
ISAIL=1
M1=51
M2=N
IP1=100
IP2=2*N-1
C PRINT FIRST DIGITS
800 DO551I=M1,M2
551 IP(I)=LK(I)/10
PRINT553,(IP(I),I=M1,M2)
C PRINT SECOND DIGITS
DO552I=M1,M2
552 IP(I)=LK(I)-10*IP(I)
PRINT553,(IP(I),I=M1,M2)
PRINT554
C INITIALIZE THE P ARRAY BY FILLING WITH BLANKS AND PERIODS
C THE PERIODS REPRESENT VARIABLES AS YET UNCLUSTERED -- THE
C BLANKS ARE FOR SPACING
DO560I=1,199,2
P(I+1)=BLANK
560 P(I)=PERIOD
C INDEX THROUGH THE MERGES, USIGG I
DO600I=1,N1
C COMPUTE THE APPROPRIATE PLACES IN THE P ARRAY
JI=2*LKI(I)-1
JJ=2*LKJ(I)-1
C F+LL IN XS BETWEEN JI AND JJ IN THE P ARRAY
DO590KK=JJ,JI
590 P(KK)=XXXX
C IF NEXT ONE IS CLUSTERED AT SAME LEVEL, DONT PRINT YET
IF ((XX(I) .EQ. XX(I+1)) .AND. (I .NE. N1)) GO TO 600
C PRINT OUT THE P ARRAY
595 PRINT596,XX(I),(P(III),III=IP1,IP2)
600 CONTINUE
G OTO(900,750),ISAIL
900 PRINT901
C **** NOTE ****
C THIS IS EXPERIMENTAL AND MAY BE OMMITTED
C EXPAND BY CLUSTERS AND COMPUTE THE Z SCORES
C CLEAR IP
C IP HAS THE CLUSTER NUMBER
DO910I=1,N
910 IP(I)=0
C LOOK AT THE LINKS, INDEXED BY I
DO950I=1,N2
LI=LKI(I)
LJ=LKJ(I)
IP1=IP(LJ)
IP2=IP(LI)
C LI AND LJ LINK CLUSTERS NUMBERED IP1 AND IP2
C SET ALL IP1 AND IP2 ELEMENTS, AND THOSE BETWEEN LJ AND LI, TO I.
DO912II=LJ,LI
912 IP(II)=I
IF(IP1.EQ.0)GOTO920
915 IF(LJ.EQ.1)GOTO920
LJ=LJ-1
IF(IP(LJ).NE.IP1)GOTO919
IP(LJ)=I
GOTO915
919 LJ=LJ+1
920 LKJ(I)=LJ
IF(IP2.EQ.0)GOTO930
925 IF(LI.EQ.N)GOTO930
LI=LI+1
IF(IP(LI).NE.IP2)GOTO929
IP(LI)=I
GOTO925
929 LI=LI-1
930 LKI(I)=LI
C LI AND LJ NOW HAVE THE TOTAL SPAN OF THE CLUSTER
C COMPUTE THE STATISTIC AND DIVIDE BY THE CORRECT SIGMA
SS=0.
ST=0.
DO960II=LJ,LI
LLI=LK(II)
DO961JJ=1,N
961 SS=SS+E(JJ,LLI)
DO962JJ=LJ,LI
LLJ=LK(JJ)
962 ST=ST+E(LLI,LLJ)
960 CONTINUE
SS=SS-ST
KXK=LI-LJ+1
XK=KXK
XD1=(XN-XK)*XK
XD2=XK*(XK-1.)
VAL=(SS/XD1-ST/XD2)/SIG(KXK)
PRINT970,KXK,VAL,(LK(II),II=LJ,LI)
970 FORMAT(//1X,I5,5H PTS.,F25.10,/,(/,1X,25I4))
950 CONTINUE
C **** NOTE ****
C THIS STATISTIC HAS NO VALIDITY IF THE DATA HAS ONLY RANK ORDER
C STRUCTURE. IT SHOULD BE DISREGARDED IN THIS CASE
C IF YOU TRANSFORM YOUR DATA INTO RANKS, HOWEVER, THESE VALUES
C MAKE SENSE.
C **** END OF EXPERIMENTAL SECTION ****
C GOTO NEXT METHOD -- IF DONE, GO READ ANOTHER N
K=K+1
IF (K .EQ. 3) GO TO 10
GO TO 50
11 FORMAT(I3,I2)
12 FORMAT(11A6)
61 FORMAT(21H1CONNECTEDNESS METHOD)
62 FORMAT(16H1DIAMETER METHOD)
751 FORMAT(10H1CONTINUED)
553 FORMAT(20X,50(1X,I1))
554 FORMAT(//)
596 FORMAT(1X,E16.8,4X,100A1)
901 FORMAT(//,14H END OF METHOD/1H1)
END