# # (C) 2005, Rob W. W. Hooft (rob@hooft.net) # # Comparison algorithm as implemented by Reinhard Schneider and Chris Sander # for comparison of protein sequences, but implemented to compare two # ASCII strings. This can be very useful for command interpreters to account # for mistyped commands (use the routine "compare(s1, s2)" in here to get # a score for each possible command, and see if one stands out). The comparison # makes use of a similarity matrix for letters: in the protein case this is # normally a chemical functionality similarity, for our case this is a matrix # based on keys next to each other on a US Qwerty keyboard and on "capital # letters are similar to their lowercase equivalent" # # The algorithm does not cut corners: it is sure to find the absolute best # match between the two strings. # # No attempt has been made to make this efficient in time or memory. Time taken # and memory used is proportional to the product of the length of the input # strings. Use for strings longer than 25 characters is entirely for your own # risk. # # Use freely, but please attribute when using. # How much does it cost to make a hole in one of the strings? GAPOPENPENALTY = -0.3 # How much does it cost to elongate a hole in one of the strings? GAPELONGATIONPENALTY = -0.2 # How much alike (0.0-1.0) are small and capital letters? CAPITALIZESCORE = 0.8 # How much alike (0.0-1.0) are characters next to each other on a (US) keyboard? NEXTKEYSCORE = 0.4 comparematrix = {} def _makekeyboardmap(): # different characters score 0.0, equal characters score 1.0 for i in range(33, 126+1): for j in range(33, 126+1): comparematrix[i,j] = 0.0 comparematrix[i,i] = 1.0 # Capital and small letters are CAPITALIZESCORE alike capdist = ord('A') - ord('a') for i in range(ord('a'), ord('z')+1): comparematrix[i, i + capdist] = CAPITALIZESCORE comparematrix[i + capdist, i] = CAPITALIZESCORE # Keyboard layout, add some score for letters that are close together line1='`1234567890-= ' line2=' qwertyuiop[] ' line3=' asdfghjkl; ' line4=' zxcvbnm,./ ' for i in range(len(line1)-1): _keyboardneighbour(line1[i], line1[i+1]) _keyboardneighbour(line2[i], line2[i+1]) _keyboardneighbour(line3[i], line3[i+1]) _keyboardneighbour(line4[i], line4[i+1]) _keyboardneighbour(line1[i], line2[i]) _keyboardneighbour(line2[i], line3[i]) _keyboardneighbour(line3[i], line4[i]) _keyboardneighbour(line1[i], line2[i+1]) _keyboardneighbour(line2[i], line3[i+1]) _keyboardneighbour(line3[i], line4[i+1]) def _keyboardneighbour(c1, c2): i1 = ord(c1) i2 = ord(c2) if 33 <= i1 <= 126 and 33 <= i2 <= 126: comparematrix[i1,i2] = 0.4 comparematrix[i2,i1] = 0.4 _makekeyboardmap() def compare(s1, s2): lh = {} gapped = {} l1 = len(s1) l2 = len(s2) if s1 == s2: return l1+1 # Top left of the matrix is "before the first character" in both directions lh[1, 1] = 0.0 gapped[1, 1] = False # Start with a gap in s1 lh[2, 1] = GAPOPENPENALTY gapped[2, 1] = True for ii in range(3, l1+2): lh[ii, 1] = lh[ii-1, 1] + GAPELONGATIONPENALTY gapped[ii, 1] = True # Start with a gap in s2 lh[1, 2] = GAPOPENPENALTY gapped[1, 2] = True for jj in range(3, l2+2): lh[1, jj] = lh[1, jj-1] + GAPELONGATIONPENALTY gapped[1, jj] = True # The main algorithm: for each point in the matrix decide what the best # route so far is, by comparing the diagonal route forward with the # possibility to open or elongate a gap either way. for jj in range(1, l2+1): for ii in range(1, l1+1): oc1 = ord(s1[ii-1]) oc2 = ord(s2[jj-1]) if 33 <= oc1 <= 126 and 33 <= oc2 <= 126: ld = comparematrix[oc1, oc2] elif oc1 == oc2: ld = 1.0 else: ld = 0.0 if gapped[ii+1, jj]: gph = GAPELONGATIONPENALTY else: gph = GAPOPENPENALTY if gapped[ii, jj+1]: gpv = GAPELONGATIONPENALTY else: gpv = GAPOPENPENALTY s = lh[ii, jj] + ld sh = lh[ii+1, jj] + gph sv = lh[ii, jj+1] + gpv sd = max(sh, sv) if s >= sd: lh[ii+1, jj+1] = s gapped[ii+1, jj+1] = False else: lh[ii+1, jj+1] = sd gapped[ii+1, jj+1] = True # The highest alignment score is in the bottom right corner of the matrix, # behind the last character in both strings return lh[l1+1, l2+1] def test(): assert compare("test", "test") == len("test") + 1.0 assert compare("Test", "test") == len("test") - 1.0 + CAPITALIZESCORE assert compare("rest", "test") == len("test") - 1.0 + NEXTKEYSCORE assert compare("rest", "crest") == len("rest") + GAPOPENPENALTY if __name__ == "__main__": test() try: while True: s1 = raw_input("s1: ") if not s1: break s2 = raw_input("s2: ") print "Score:", compare(s1, s2) except (EOFError, KeyboardInterrupt): pass