# Clusters are initially just the rows
clust=[bicluster(rows[i],id=i) for i in range(len(rows))]
while len(clust)>1:
lowestpair=(0,1)
closest=distance(clust[0].vec,clust[1].vec)
# loop through every pair looking for the smallest distance
for i in range(len(clust)):
for j in range(i+1,len(clust)):
# distances is the cache of distance calculations
if (clust[i].id,clust[j].id) not in distances:
distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec)
d=distances[(clust[i].id,clust[j].id)]
if d<closest:
closest=d
lowestpair=(i,j)
# calculate the average of the two clusters
mergevec=[
(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0
for i in range(len(clust[0].vec))]
# create the new cluster
newcluster=bicluster(mergevec,left=clust[lowestpair[0]],
right=clust[lowestpair[1]],
distance=closest,id=currentclustid)
# cluster ids that weren't in the original set are negative
currentclustid-=1
del clust[lowestpair[1]]
del clust[lowestpair[0]]
clust.append(newcluster)
def printclust(clust,labels=None,n=0):
# indent to make a hierarchy layout
for i in range(n): print ' ',
if clust.id<0:
# negative id means that this is branch
print '-'
else:
# positive id means that this is an endpoint
if labels==None: print clust.id
else: print labels[clust.id]
# now print the right and left branches
if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)
# indent to make a hierarchy layout
for i in range(n): print ' ',
if clust.id<0:
# negative id means that this is branch
print '-'
else:
# positive id means that this is an endpoint
if labels==None: print clust.id
else: print labels[clust.id]
# now print the right and left branches
if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
def printclust(clust,labels=None,n=0):
# indent to make a hierarchy layout
for i in range(n): print ' ',
if clust.id<0:
# negative id means that this is branch
print '-'
else:
# positive id means that this is an endpoint
if labels==None: print clust.id
else: print labels[clust.id]
# now print the right and left branches
if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)
class matchrow:
def __init__(self,row,allnum=False):
if allnum:
self.data=[float(row[i]) for i in range(len(row)-1)]
else:
self.data=row[0:len(row)-1]
self.match=int(row[len(row)-1])
def loadmatch(f,allnum=False):
rows=[]
for line in file(f):
rows.append(matchrow(line.split(','),allnum))
return rows
A new warning, UnicodeWarning, is triggered when you attempt to compare a Unicode string and an 8-bit string that can’t be converted to Unicode using the default ASCII encoding. The result of the comparison is false:
>>> chr(128) == unichr(128) # Can't convert chr(128) to Unicode
__main__:1: UnicodeWarning: Unicode equal comparison failed
to convert both arguments to Unicode - interpreting them
as being unequal
False
>>> chr(127) == unichr(127) # chr(127) can be converted
True
Previously this would raise a UnicodeDecodeError exception, but in 2.5 this could result in puzzling problems when accessing a dictionary. If you looked up unichr(128) and chr(128) was being used as a key, you’d get a UnicodeDecodeError exception. Other changes in 2.5 resulted in this exception being raised instead of suppressed by the code in dictobject.c that implements dictionaries.
Raising an exception for such a comparison is strictly correct, but the change might have broken code, so instead UnicodeWarning was introduced.