""" bioutil.py V. 1.14, Dic 6 2006, by leonardo maffi Dictionaries for translation, encoding, etc. of nucleotide and amino acid sequences. """ """ Possibile improvements: - Other symbols can be added to the standard genetic code. - Maybe the ambiguity_codes dict must contain lowercase strings. """ import re def readOneFasta(fasta_file): """readOneFasta(fasta_file): reads the first sequence from a fasta file, and returns the joined sequence. (Note: this is a rough and basic function still).""" cumulative = [] first = True for line in fasta_file: stripped_line = line.strip() if not stripped_line: continue if stripped_line[0] == ">": if first: first = False else: break else: cumulative.append(stripped_line) return "".join(cumulative) # ------------------------------------------------------------------------------- def xcodons(seq, overlap=0): """xcodons(seq, overlap=0): generator that returns the integer codons of the given (string) sequence seq. Trailing bases not multiple of 3 are ignored. overlap is the length of the overlap of adiacent codons. If it is zero then they don't share bases. It can be 0, 1 or 2. Examples: list(xcodons("0123456")) ==> ['012', '345'] list(xcodons("0123456", overlap=1)) ==> ['012', '234', '456'] list(xcodons("0123456", overlap=2)) ==> ['012', '123', '234', '345', '456'] """ assert 0 <= overlap <= 2 len_seq = len(seq) pos = 0 while len_seq - pos >= 3: yield seq[pos : pos+3] pos += 3 - overlap # ------------------------------------------------------------------------------- # Genetic code converters. Everything will be lowercase. Perfect codes. standard_genetic_code = """ Short Symbol Codons Name Nome Ala A GCU,GCC,GCA,GCG Alanine Alanina Arg R CGU,CGC,CGA,CGG,AGA,AGG Arginine Arginina Asn N AAU,AAC Asparagine Asparagina Asp D GAU,GAC Aspartic acid Acido aspartico Cys C UGU,UGC Cysteine Cisteina Gln Q CAA,CAG Glutamine Glutammina Glu E GAA,GAG Glutamic acid Acido glutammico Gly G GGU,GGC,GGA,GGG Glycine Glicina His H CAU,CAC Histidine Istidina Ile I AUU,AUC,AUA Isoleucine Isoleucina Leu L UUA,UUG,CUU,CUC,CUA,CUG Leucine Leucina Lys K AAA,AAG Lysine Lisina Met M AUG Methionine Metionina Phe F UUU,UUC Phenylalanine Fenilalanina Pro P CCU,CCC,CCA,CCG Proline Prolina Ser S UCU,UCC,UCA,UCG,AGU,AGC Serine Serina Thr T ACU,ACC,ACA,ACG Threonine Treonina Trp W UGG Tryptophan Triptofano Xxx X NNN Unknown Sconosciuto Tyr Y UAU,UAC Tyrosine Tirosina Val V GUU,GUC,GUA,GUG Valine Valina End * UAG,UGA,UAA Stop Stop """ """ Other symbols are often necessary in partly determined sequences: - B (Asx) was assigned to aspartic acid or asparagine when these have not been distinguished; - U (Sec = UGA) selenocysteine - X (Xaa) means that the identity of an amino acid is undetermined, or that the amino acid is atypical. - Z (Glx) was similarly assigned to glutamic acid or glutamine (or substances such as 4-carboxyglutamic acid and 5-oxoproline that yield glutamic acid on acid hydrolysis of peptides) - J is used in NMR work as designation for signals assigned either to leucine or to isoleucine which cannot be distinguished from each other. """ def table_converter(txt): """table_converter(txt): converts a 2D text table in a list of titles plus a list of lists. The first nonempty line of txt is used as the titles row, and it must start with a nonspace. Each label of the titles row must start at the beginning of the columns.""" # Split enters, clean lines and remove empty ones clean1 = filter(None, map(str.strip, txt.split("\n"))) # header line header, clean2 = clean1[0], clean1[1:] titles = header.split() # find splitting positions according to the starting point of the labels of the header line spos = [header.index(part) for part in titles] + [None] # split according to the splitting positions, and strip the elements, for each line result = [[row[p:spos[i+1]].strip() for i,p in enumerate(spos[:-1])] for row in clean2] return titles, result # split the lowercasse txt table, finding the header and the all other lines titles, table1 = table_converter(standard_genetic_code.lower()) # transpose table1, so table1_trasp contains the columns table2 = zip(*table1) # split the third column of table2 according to the comma table2[2] = [row.split(",") for row in table2[2]] __all__ = [] # Create all the possible converter dicts for title1, col1 in zip(titles, table2): for title2, col2 in zip(titles, table2): if title1 != title2: # creates the currenct dict converter curr_dict = {} for key, val in zip(col1, col2): if isinstance(key, list): for k in key: curr_dict[k] = val else: curr_dict[key] = val #print dict_name, curr_dict # debugging # Create the new dict name dict_name = title1 + "2" + title2 # Add the dict name to the globals globals()[dict_name] = curr_dict # Add the dict name to the list of exported names __all__.append(dict_name) # debugging # print "All converters:", __all__, "\n" # for d in __all__: print d, globals()[d], "\n" # ------------------------------------------------------------------------------- # background residue frequencies backf1 = """\ A 0.074 C 0.025 D 0.054 E 0.054 F 0.047 G 0.074 H 0.026 I 0.068 L 0.099 K 0.058 M 0.025 N 0.045 P 0.039 Q 0.034 R 0.052 S 0.057 T 0.051 V 0.073 W 0.013 Y 0.034""" backf2 =[row.split() for row in backf1.lower().split("\n") if row.strip()] background_afreq = dict( (aa, float(fr)) for aa,fr in backf2 ) # ------------------------------------------------------------------------------- # Kyte & Doolittle index of hydrophobicity kd = {'A': 1.8, 'R':-4.5, 'N':-3.5, 'D':-3.5, 'C': 2.5, 'Q':-3.5, 'E':-3.5, 'G':-0.4, 'H':-3.2, 'I': 4.5, 'L': 3.8, 'K':-3.9, 'M': 1.9, 'F': 2.8, 'P':-1.6, 'S':-0.8, 'T':-0.7, 'W':-0.9, 'Y':-1.3, 'V': 4.2} def hydrophobicity(seq, table=kd): return [table[aa] for aa in seq] # ------------------------------------------------------------------------------- # BLOSUM62 amino acid substitution matrix. # Reference: Henikoff, S. and Henikoff, J. G. (1992). Amino acidsubstitution matrices # from protein blocks. Proc. Natl. Acad.Sci. USA 89: 10915-10919. # Info: http://en.wikipedia.org/wiki/Substitution_matrix # ftp://ftp.ncbi.nih.gov/blast/matrices/ # See PAM matrices too. _blosum62_a = """\ A B C D E F G H I K L M N P Q R S T V W X Y Z * A 4 -2 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -1 -2 -1 -4 B -2 6 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -1 -3 2 -4 C 0 -3 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -1 -2 -4 -4 D -2 6 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -1 -3 2 -4 E -1 2 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -1 -2 5 -4 F -2 -3 -2 -3 -3 6 -3 -1 0 -3 0 0 -3 -4 -3 -3 -2 -2 -1 1 -1 3 -3 -4 G 0 -1 -3 -1 -2 -3 6 -2 -4 -2 -4 -3 0 -2 -2 -2 0 -2 -3 -2 -1 -3 -2 -4 H -2 -1 -3 -1 0 -1 -2 8 -3 -1 -3 -2 1 -2 0 0 -1 -2 -3 -2 -1 2 0 -4 I -1 -3 -1 -3 -3 0 -4 -3 4 -3 2 1 -3 -3 -3 -3 -2 -1 3 -3 -1 -1 -3 -4 K -1 -1 -3 -1 1 -3 -2 -1 -3 5 -2 -1 0 -1 1 2 0 -1 -2 -3 -1 -2 1 -4 L -1 -4 -1 -4 -3 0 -4 -3 2 -2 4 2 -3 -3 -2 -2 -2 -1 1 -2 -1 -1 -3 -4 M -1 -3 -1 -3 -2 0 -3 -2 1 -1 2 5 -2 -2 0 -1 -1 -1 1 -1 -1 -1 -2 -4 N -2 1 -3 1 0 -3 0 1 -3 0 -3 -2 6 -2 0 0 1 0 -3 -4 -1 -2 0 -4 P -1 -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2 7 -1 -2 -1 -1 -2 -4 -1 -3 -1 -4 Q -1 0 -3 0 2 -3 -2 0 -3 1 -2 0 0 -1 5 1 0 -1 -2 -2 -1 -1 2 -4 R -1 -2 -3 -2 0 -3 -2 0 -3 2 -2 -1 0 -2 1 5 -1 -1 -3 -3 -1 -2 0 -4 S 1 0 -1 0 0 -2 0 -1 -2 0 -2 -1 1 -1 0 -1 4 1 -2 -3 -1 -2 0 -4 T 0 -1 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1 0 -1 -1 -1 1 5 0 -2 -1 -2 -1 -4 V 0 -3 -1 -3 -2 -1 -3 -3 3 -2 1 1 -3 -2 -2 -3 -2 0 4 -3 -1 -1 -2 -4 W -3 -4 -2 -4 -3 1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11 -1 2 -3 -4 X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4 Y -2 -3 -2 -3 -2 3 -3 2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1 2 -1 7 -2 -4 Z -1 2 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -1 -2 5 -4 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 """ _blosum62_b = [row.split() for row in _blosum62_a.lower().split("\n") if row.strip()] # Create the blosum62 matrix of integers blosum62_mat = [map(int, row[1:]) for row in _blosum62_b[1:]] # Create a conversion dict for the blosum62 matrix blosum62_index = dict((symbol, i) for i,symbol in enumerate(_blosum62_b[0])) sy = _blosum62_b[0] blosum62 = dict( ((a1,a2),val) for a1,row in zip(sy, blosum62_mat) for a2,val in zip(sy, row) ) # print blosum62_mat, "\n\n", blosum62_index # debug # ------------------------------------------------------------------------------- # Ambiguity code rules. # The table format is: IUB(/GCG) = Base # Note: . means everything bug A,C,G,T ambiguity_code_rules_tab = """\ . = A = A C = C G = G T = T R = G,A Y = C,T M = A,C K = G,T S = G,C W = A,T B = C,G,T D = A,G,T H = A,C,T V = A,C,G X,N = A,C,G,T """ # Remove commas and split the text table according to the newlines lines1 = ambiguity_code_rules_tab.replace(",", "").split("\n") # Strip each nonempty line and split it according to the equal sign lines2 = [row.strip().split("=") for row in lines1 if row.strip()] # Strip the two elements of each row lines3 = [map(str.strip, row) for row in lines2] # Create a *uppercase* dictionary that maps the IUB/GCG codes to the equivalent bases ambiguity_codes = {} for iub_seq, bases in lines3: for iub in iub_seq: ambiguity_codes[iub] = bases #print ambiguity_codes # debug # ------------------------------------------------------------------------------- _ambiguity_codes_re = {"^" : ""} for code, bases in ambiguity_codes.iteritems(): if len(bases) <= 1: _ambiguity_codes_re[code] = bases else: _ambiguity_codes_re[code] = "[%s]" % bases def restrictionRE(rec_seq): """restrictionRE(rec_seq): given the (uppercase) recognition sequence of a restriction enzyme (it may contain the cutting point too, represented as ^, it's ignored), it returns the Regular Expression string to find it. Example: restrictionRE("R^CCGGNY") ==> "[GA]CCGG[ACGT][CT]" """ # REBASE URL: http://rebase.neb.com/rebase/link_bionet return "".join(_ambiguity_codes_re[symbol] for symbol in rec_seq) # ------------------------------------------------------------------------------- def xrestrictionFinder(seqence, patt): """xrestrictionFinder(seqence, patt): generator function that finds all restriction sites, overlapping ones too. Input: - seqence: the genetic sequence (a string) to search on. - patt: a compiled Regular Expression object, like ones produced by estrictionRE() and then compiled. Output: it yelds tuples of two elements: - the matchObj.group(), that is the actual pattern found. - the matchObj.start(), that is the position of its starting point in the given sequence. Notes: - It finds overlapping patterns too. - It is tested only with regular expressions produces by restriction enzymes (like ones of the REBASE, produced compiling restrictionRE), it is not testes with any other regular expression. - So far it works with linear genomes only (no circular ones).""" pos = 0 while True: matchObj = patt.search(seqence, pos) if not matchObj: break else: pos = matchObj.start() + 1 yield matchObj.group(), matchObj.start() def restrictionPoints(seq, recog_seq): """restrictionPoints(seq, recog_seq): commodity function that given a genetic sequence seq and the recognition sequence recog_seq of a restriction enzyme (for example RGCNGC^Y, where ^ is the cutting point), returns the list of positions of restriction points. Note: the ^ must be present.""" shift = recog_seq.index("^") recog_seq_patt = restrictionRE(recog_seq) recog_seq_re = re.compile(recog_seq_patt, flags=re.I) return [pos+shift for seq,pos in xrestrictionFinder(seq, recog_seq_re)] # ------------------------------------------------------------------------------- def loadRebase(filename="link_bionet"): """ load_rebase(filename="link_bionet"): load the REBASE file in a list of tuples. Note: this function can be improved, see below. Input lines format: enzyme_name (prototype) recognition_sequence Note: prototype may be missing. Note: there are some repetitions like: BtrI CAC^GTC BtrI GAC^GTG BtsI GCAGTGNN^ BtsI ^CACTGC Returned data: {enzyme_name: [recognition sequences]} --------------------------------------------------------------------------- Possible improvements: - This method has to handle reverse complements of recognition sites. Many restriction enzymes are palindromic in the sense that their reverse complements are the same. These biological palindromes indicate that the enzyme can cut the site on both strands. - Note that if the cut site isn't exactly in the middle, there will be "sticky ends" of single stranded DNA that make it possible to anneal the fragment with a complementary sticky end. - It would be nice to be able to return if a particular restriction enzyme produces sticky ends at its cut site. It would also be useful to know what other enzymes create sticky ends that will anneal with the sticky ends of this enzyme. --------------------------------------------------------------------------- Info on REBASE restriction enzymes database: URL: http://rebase.neb.com/rebase/link_bionet Data format: enzyme_name (prototype) recognition_sequence recognition_sequence: These are written from 5' to 3', only one strand being given, and the point of cleavage is indicated ^. When no ^ appears, the precise cleavage site has not been determined. In all cases the recognition sequences are oriented so that the cleavage sites lie on their 3' side. Recognition sequences are given using the standard abbreviations to represent ambiguity. """ fin = file(filename) while not fin.readline().startswith("Rich Roberts"): pass fin.readline() data = {} # With Python 2.5 it can be used a defaultDict. for line in fin: line2 = line.strip() if line2: enzyme_name = line2.split(" ", 1)[0] rec_seq = line2.rsplit(" ", 1)[-1] if enzyme_name in data: data[enzyme_name].append(rec_seq) else: data[enzyme_name] = [rec_seq] fin.close() return data # ------------------------------------------------------------------------------- _test_data = """\ CGGGCGCACAGCCGGCGCGGAGGCCCCACAGCCCCGCCGGGACCCGAGGCCAAGCGAGGGGCTGCCAGTGTCCCGGG ACCCACCGCGTCCGCCCCAGCCCCGGGTCCCCGCGCCCACCCCATGGCGACGGACGCGGCGCTACGCCGGCTTCTGAG GCTGCACCGCACGGAGATCGCGGTGGCCGTGGACAGCGCCTTCCCACTGCTGCACGCGCTGGCTGACCACGACGTGGT CCCCGAGGACAAGTTTCAGGAGACGCTTCATCTGAAGGAAAAGGAGGGCTGCCCCCAGGCCTTCCACGCCCTCCTGTC CTGGCTGCTGACCCAGGACTCCACAGCCATCCTGGACTTCTGGAGGGTGCTGTTCAAGGACTACAACCTGGAGCGCTA TGGCCGGCTGCAGCCCATCCTGGACAGCTTCCCCAAAGATGTGGACCTCAGCCAGCCCCGGAAGGGGAGGAAGCCCCC GGCCGTCCCCAAGGCTTTGGTACCGCCACCCAGACTCCCCACCAAGAGGAAGGCCTCAGAAGAGGCTCGAGCTGCCGC GCCAGCAGCCCTGACTCCAAGGGGCACCGCCAGCCCAGGCTCTCAACTGAAGGCCAAGCCCCCCAAGAAGCCGGAGAG CAGCGCAGAGCAGCAGCGCCTTCCACTCGGGAACGGGATTCAGACCATGTCAGCTTCAGTCCAGAGAGCTGTGGCCAT GTCCTCCGGGGACGTCCCGGGAGCCCGAGGGGCCGTGGAGGGGATCCTCATCCAGCAGGTGTTTGAGTCAGGCGGCTC CAAGAAGTGCATCCAGGTTGGTGGGGAGTTCTACACTCCCAGCAAGTTCGAAGACTCCGGCAGTGGGAAGAACAAGGC CCGCAGCAGCAGTGGCCCGAAGCCTCTGGTTCGAGCCAAGGGAGCCCAGGGCGCTGCCCCCGGTGGAGGTGAGGCTAG GCTGGGCCAGCAGGGCAGCGTTCCCGCCCCTCTGGCCCTCCCCAGTGACCCCCAGCTCCACCAGAAGAATGAGGACGA GTGTGCCGTGTGTCGGGACGGCGGGGAGCTCATCTGCTGTGACGGCTGCCCTCGGGCCTTCCACCTGGCCTGCCTGTC CCCTCCGCTCCGGGAGATCCCCAGTGGGACCTGGAGGTGCTCCAGCTGCCTGCAGGCAACAGTCCAGGAGGTGCAGCC CCGGGCAGAGGAGCCCCGGCCCCAGGAGCCACCCGTGGAGACCCCGCTCCCCCCGGGGCTTAGGTCGGCGGGAGAGGA GGTAAGAGGTCCACCTGGGGAACCCCTAGCCGGCATGGACACGACTCTTGTCTACAAGCACCTGCCGGCTCCGCCTTC TGCAGCCCCGCTGCCAGGGCTGGACTCCTCGGCCCTGCACCCCCTACTGTGTGTGGGTCCTGAGGGTCAGCAGAACCT GGCTCCTGGTGCGCGTTGCGGGGTGTGCGGAGATGGTACGGACGTGCTGCGGTGTACTCACTGCGCCGCTGCCTTCCA CTGGCGCTGCCACTTCCCAGCCGGCACCTCCCGGCCCGGGACGGGCCTGCGCTGCAGATCCTGCTCAGGAGACGTGAC CCCAGCCCCTGTGGAGGGGGTGCTGGCCCCCAGCCCCGCCCGCCTGGCCCCTGGGCCTGCCAAGGATGACACTGCCAG TCACGAGCCCGCTCTGCACAGGGATGACCTGGAGTCCCTTCTGAGCGAGCACACCTTCGATGGCATCCTGCAGTGGGC CATCCAGAGCATGGCCCGTCCGGCGGCCCCCTTCCCCTCCTGACCCCAGATGGCCGGGACATGCAGCTCTGATGAGAG AGTGCTGAGAAGGACACCTCCTTCCTCAGTCCTGGAAGCCGGCCGGCTGGGATCAAGAAGGGGACAGCGCCACCTCTT GTCAGTGCTCGGCTGTAAACAGCTCTGTGTTTCTGGGGACACCAGCCATCATGTGCCTGGAAATTAAACCCTGCCCCA CTTCTCTACTCTGGAAGTCCCCGGGAGCCTCTCCTTGCCTGGTGACCTACTAAAAATATAAAAATTAGCTGGGTGTGG TGGTGGGTGCCTGTAATCCCAGCTACATGGGAGCCTGAGGCATGAGAATCACTTGAACTCGGGAGGTGGAGGTTGCAG TGAGCTGAGATTGCGCCACTGCACTCCAGTCTGGTCGGCAAGAGTGAGACTCCGTCTCAAAAACAAAACAAAAAAACC ACATAACATAAATTTATCATCTCGACCACTTTTCAGTTCAGTGGCATTCACATCTCATGTA""" _test_data = _test_data.replace(" ", "").replace("\n","").replace("T", "u").lower() #print len(test_data) / 3.0 # ------------------------------------------------------------------------------- # Chou-Fasman parameters # From: http://www.cmbi.kun.nl/gvteach/aainfo/chou.shtml chou_fasman_txt = """\ AAcid P(a) P(b) P(turn) f(i) f(i+1) f(i+2) f(i+3) A 1.42 0.83 0.66 0.06 0.076 0.035 0.058 R 0.98 0.93 0.95 0.070 0.106 0.099 0.085 D 1.01 0.54 1.46 0.147 0.110 0.179 0.081 N 0.67 0.89 1.56 0.161 0.083 0.191 0.091 C 0.70 1.19 1.19 0.149 0.050 0.117 0.128 E 1.39 1.17 0.74 0.056 0.060 0.077 0.064 Q 1.11 1.10 0.98 0.074 0.098 0.037 0.098 G 0.57 0.75 1.56 0.102 0.085 0.190 0.152 H 1.00 0.87 0.95 0.140 0.047 0.093 0.054 I 1.08 1.60 0.47 0.043 0.034 0.013 0.056 L 1.41 1.30 0.59 0.061 0.025 0.036 0.070 K 1.14 0.74 1.01 0.055 0.115 0.072 0.095 M 1.45 1.05 0.60 0.068 0.082 0.014 0.055 F 1.13 1.38 0.60 0.059 0.041 0.065 0.065 P 0.57 0.55 1.52 0.102 0.301 0.034 0.068 S 0.77 0.75 1.43 0.120 0.139 0.125 0.106 T 0.83 1.19 0.96 0.086 0.108 0.065 0.079 W 1.08 1.37 0.96 0.077 0.013 0.064 0.167 Y 0.69 1.47 1.14 0.082 0.065 0.114 0.125 V 1.06 1.70 0.50 0.062 0.048 0.028 0.053 """ chou_fasman_titles, cf_tab = table_converter(chou_fasman_txt) chou_fasman = [[(float(el) if pos else el) for pos,el in enumerate(row)] for row in cf_tab] # ------------------------------------------------------------------------------- __all__ += ["fasta_patt", "xcodons", "background_afreq", "blosum62", "blosum62_mat",\ "blosum62_index", "ambiguity_codes", "restrictionRE", "xrestrictionFinder",\ "restrictionPoints", "loadRebase"] if __name__ == '__main__1': # A little demo print "Test data bases:", _test_data, "\n" print "Test data symbols:", print "".join(codons2symbol.get(b3, "x") for b3 in xcodons(_test_data)).upper() print "\n__all__:", __all__, "\n\n" patt = re.compile( "ab|bg" ) assert list(xrestrictionFinder("abgab", patt)) == [('ab', 0), ('bg', 1), ('ab', 3)] for recog_seq in "RGCGC^Y", "^NNNNNNNNGACNNNNNNTGG", "GACGCNNNNN^": print "restriction points for", recog_seq, "are:", print restrictionPoints(_test_data, recog_seq) REBASE_filename = "bioutil_link_bionet" try: rebase = loadRebase(REBASE_filename) except IOError: print "\nREBASE file not found.", print 'You can create it saving the following URL as "bioutil_link_bionet":' print "http://rebase.neb.com/rebase/link_bionet" else: print "\nREBASE loaded." print "Some restriction enzymes with undetermined precise cleavage site:" print [(n,rs) for n,rs in rebase.items() if sum("^" not in r for r in rs)][:30] print "\nSome restriction enzymes with more than one recognized sequence:" print [(n,rseqs) for n,rseqs in rebase.iteritems() if len(rseqs)>1][:20]