Package PyML :: Package utils :: Module fasta
[hide private]
[frames | no frames]

Source Code for Module PyML.utils.fasta

  1   
  2  # Copyright (C) 2003, 2004 by BiRC -- Bioinformatics Research Center 
  3  #                                     University of Aarhus, Denmark 
  4  #                                     Contact: Thomas Mailund <mailund@birc.dk> 
  5  #  
  6  # This program is free software; you can redistribute it and/or modify 
  7  # it under the terms of the GNU General Public License as published by 
  8  # the Free Software Foundation; either version 2 of the License, or (at 
  9  # your option) any later version. 
 10  #  
 11  # This program is distributed in the hope that it will be useful, but 
 12  # WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
 14  # General Public License for more details. 
 15  #  
 16  # You should have received a copy of the GNU General Public License 
 17  # along with this program; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, 
 19  # USA. 
 20   
 21  """ 
 22  A parser for FASTA files. 
 23   
 24  Copyright (C) 2003, 2004 by BiRC -- Bioinformatics Research Center 
 25                                      University of Aarhus, Denmark 
 26                                      Contact: Thomas Mailund <mailund@birc.dk> 
 27  with changes by Asa Ben-Hur 
 28  """ 
 29   
 30  from __future__ import generators 
 31  import os 
 32   
 33   
def myopen(fileName):
    """Open fileName for reading, transparently handling gzip compression.

    The file is probed by trying to read one line through gzip; if that
    fails the file is treated as an ordinary (uncompressed) file.

    :Parameters:
      - `fileName` - path of the file to open

    :Returns: a `gzip.GzipFile` if the probe read succeeds, otherwise the
      object returned by the builtin `open(fileName)`.

    :Raises ValueError: if fileName is not an existing regular file.
    """
    import gzip
    import zlib

    # isfile() is already False for paths that do not exist.
    if not os.path.isfile(fileName):
        raise ValueError('file does not exist at %s' % fileName)

    probe = gzip.GzipFile(fileName)
    try:
        probe.readline()
    except (IOError, OSError, EOFError, zlib.error):
        # Not readable as gzip: bad magic number, truncated or corrupt stream.
        gzippedFile = False
    else:
        gzippedFile = True
    finally:
        # Always release the probe handle, also when the read failed.
        probe.close()

    if gzippedFile:
        return gzip.GzipFile(fileName)
    return open(fileName)
53 54
class MalformedInput(Exception):
    """Exception raised when the input file does not look like a fasta file.

    Derives from `Exception` so that it can be raised and caught like any
    other error (a class outside the exception hierarchy cannot be raised
    on Python 3).
    """
58
class FastaRecord:
    """One fasta entry: a header paired with its sequence."""

    def __init__(self, header, sequence):
        """Store header and sequence on the new record.

        `header` is written after a '>' marker by `__str__`, so it is
        kept here without that marker.
        """
        self.header, self.sequence = header, sequence

    def __str__(self):
        """Return the record as fasta text: '>' + header, newline, sequence, newline."""
        pieces = ('>', self.header, '\n', self.sequence, '\n')
        return ''.join(pieces)
70 71
def _fasta_itr_from_file(file):
    """Provide an iteration through the fasta records in file.

    :Parameters:
      - `file` - an open handle supporting `readline()` and line iteration

    Yields one `FastaRecord` per '>' header line; the lines between two
    headers are joined, without line terminators, into the sequence.

    :Raises MalformedInput: when the first line does not start with '>'
      (this includes empty input).  Because this is a generator the error
      surfaces when the first record is requested.
    """
    # Strip only the line terminator: slicing with [:-1] would cut a real
    # character from a final line that lacks a newline and would leave the
    # '\r' of CRLF files in place.
    h = file.readline().rstrip('\r\n')
    # startswith() is safe on an empty string, unlike indexing with [0].
    if not h.startswith('>'):
        raise MalformedInput()
    h = h[1:]

    seq = []
    for line in file:
        line = line.rstrip('\r\n')

        if line.startswith('>'):
            # A new header closes the record collected so far.
            yield FastaRecord(h, ''.join(seq))

            h = line[1:]
            seq = []
            continue

        # Blank lines end up here as '' and add nothing to the joined sequence.
        seq.append(line)

    # The last record has no following header to flush it.
    yield FastaRecord(h, ''.join(seq))
95 96
def _fasta_itr_from_name(fname):
    """Provide an iteration through the fasta records in the file named fname.

    :Parameters:
      - `fname` - path of the fasta file; it is opened with `myopen`

    The file is closed when the iteration finishes, fails, or the
    generator is closed or discarded.
    """
    f = myopen(fname)
    try:
        for rec in _fasta_itr_from_file(f):
            yield rec
    finally:
        # Runs on exhaustion, on error and on generator close.
        f.close()
103 104
def _fasta_itr(src):
    """Provide an iteration through the fasta records in `src'.

    Here `src' can be either the name of a file or an open file-like
    object (anything with a `readline` method, e.g. a regular file or a
    gzip file handle).

    :Raises TypeError: if `src' is neither a string nor file-like.
    """
    if isinstance(src, str):
        return _fasta_itr_from_name(src)
    # Duck-type the handle instead of comparing against the builtin `file`
    # type, which excludes gzip handles and does not exist on Python 3.
    if hasattr(src, 'readline'):
        return _fasta_itr_from_file(src)
    raise TypeError('src must be a file name or a file-like object, got %r'
                    % (type(src),))
116
def fasta_get_by_name(itr, name):
    """Return the first record in itr whose header equals name.

    Both the header and name are compared with surrounding whitespace
    stripped.  Iteration stops at the first match; None is returned when
    itr is exhausted without one.
    """
    wanted = name.strip()
    for record in itr:
        if record.header.strip() != wanted:
            continue
        return record
    return None
124
class fasta_itr(object):
    """An iterator through a sequence of fasta records.

    Implements both spellings of the iterator protocol (`next` and
    `__next__`) so instances work in `for` loops and with the builtin
    `next()` on Python 2 and Python 3.
    """

    def __init__(self, src):
        "Create an iterator through the records in src."
        self.__itr = _fasta_itr(src)

    def __iter__(self):
        "Return self; the object is its own iterator."
        return self

    def __next__(self):
        "Return the next record; StopIteration propagates at the end."
        return next(self.__itr)

    # Python 2 looks the method up under this name.
    next = __next__

    def __getitem__(self, name):
        """Return the next record whose header matches name, or None.

        The search consumes this iterator: records that precede the match
        are skipped and are not available afterwards.
        """
        return fasta_get_by_name(iter(self), name)
144
class fasta_slice(object):
    """Provide an iteration through the fasta records in 'src', from
    'first' up to, but not including, 'last'.
    """

    def __init__(self, src, first, last=None):
        """
        :Parameters:
          - `src` - the fasta file/file handle. file can be gzipped.
          - `first` - the first record to output (either its index in the
            file or its identifier); 0 or '' starts at the first record
          - `last` - where to stop (index in the file or identifier,
            interpreted the same way as `first`).  The record at `last`
            is not output.  None iterates to the end of the input.

        :Raises ValueError: if `first` is neither an int nor a str.
        """
        self.__itr = _fasta_itr(src)
        self.__first = first
        self.__last = last
        if isinstance(first, int):
            # Index mode: the index of the next record to be read.
            self.__current = 0
        elif isinstance(first, str):
            # Identifier mode: the header of the record read most recently.
            self.__current = None
        else:
            raise ValueError('bad first')

        # Nothing has to be skipped when starting at the very beginning.
        self.__foundFirst = (first == 0 or first == '')

    def __iter__(self):
        "Return self; the object is its own iterator."
        return self

    def __next__(self):
        """Return the next record of the slice.

        :Raises ValueError: if the input ends before `first` is found.
        :Raises StopIteration: at `last` or at the end of the input.
        """
        by_index = isinstance(self.__first, int)

        if not self.__foundFirst:
            # Skip ahead to the first requested record.
            for rec in self.__itr:
                if by_index:
                    index = self.__current
                    self.__current += 1
                    matched = (index == self.__first)
                else:
                    self.__current = rec.header
                    matched = (rec.header == self.__first)
                if matched:
                    self.__foundFirst = True
                    break
            if not self.__foundFirst:
                raise ValueError('did not find first record')
            return rec

        rec = next(self.__itr)

        if self.__last is not None:
            if by_index:
                # Compare the index of the record just read, so a slice
                # that starts at 0 ends at the same place as any other.
                index = self.__current
                self.__current += 1
                reached_last = (index == self.__last)
            else:
                self.__current = rec.header
                reached_last = (rec.header == self.__last)
            if reached_last:
                # Swap in an empty iterator so that later calls keep
                # raising StopIteration instead of resuming past `last`.
                self.__itr = iter(())
                raise StopIteration

        return rec

    # Python 2 looks the method up under this name.
    next = __next__

    def __getitem__(self, name):
        """Return the next record in the slice whose header matches name,
        or None.  The search consumes the slice up to the match.
        """
        return fasta_get_by_name(iter(self), name)

    def save(self, fileName):
        """Write the remaining records of the slice to fileName.

        The file is opened in 'w' mode and closed before returning, also
        when iteration or writing fails.
        """
        outfile = open(fileName, 'w')
        try:
            for record in self:
                outfile.write(str(record))
        finally:
            outfile.close()
220
def get_sequence(src, name):
    """Look up the record called name in the fasta source src.

    src is handed to `fasta_itr`; the result of indexing that iterator
    with name is returned.
    """
    records = fasta_itr(src)
    return records[name]
225 226
def fasta_count(src):
    """
    Return the number of records found by iterating `fasta_itr(src)`.
    """
    return sum(1 for _record in fasta_itr(src))
237 238
def fasta_split(fileName, num_files, directory=None):
    """
    split a fasta file into a given number of files
    the resulting files are named by adding a number to the provided file name
    (`base.1.ext`, `base.2.ext`, ...).

    Records keep their order and are spread as evenly as possible: the
    first `num_records % num_files` files receive one record more than
    the rest.  No empty files are created, so fewer than `num_files`
    files are written when the input has fewer records than that.

    The record count and the output base path are printed to standard
    output.

    :Parameters:
      - `fileName` - the fasta file to split
      - `num_files` - the number of files to split into
      - `directory` - the directory into which to write the files
        (default: next to `fileName`)
    """
    num_records = fasta_count(fileName)
    print(num_records)
    if directory is None:
        base, ext = os.path.splitext(fileName)
    else:
        base, ext = os.path.splitext(os.path.basename(fileName))
        base = os.path.join(directory, base)
    print(base)

    # Integer arithmetic only: per_file records for every file, plus one
    # extra for each of the first `extra` files.
    per_file, extra = divmod(num_records, num_files)

    outfile = None
    file_num = 0
    remaining = 0  # records still to be written to the current file
    try:
        for rec in fasta_itr(fileName):
            if remaining == 0:
                # The current file is full (or none is open yet).
                if outfile is not None:
                    outfile.close()
                file_num += 1
                outfile = open(base + '.' + str(file_num) + ext, 'w')
                remaining = per_file + (1 if file_num <= extra else 0)
            outfile.write(str(rec))
            remaining -= 1
    finally:
        # Close the last file, also when reading or writing fails.
        if outfile is not None:
            outfile.close()
268
def fasta_subset(infileName, outfileName, ids):
    """
    Copy the records of infileName whose header is in ids to outfileName.

    :Parameters:
      - `infileName` - the fasta file to read
      - `outfileName` - the file to write; it is opened in 'w' mode
      - `ids` - the headers to keep: a dict keyed by header, or any other
        collection, which is converted with `misc.list2dict`
    """
    if not isinstance(ids, dict):
        import misc
        # NOTE(review): presumably returns a dict keyed by the elements of
        # ids so the membership test below is a hash lookup -- confirm
        # against misc.list2dict.
        ids = misc.list2dict(ids)

    outfile = open(outfileName, 'w')
    try:
        for rec in fasta_itr(infileName):
            if rec.header in ids:
                outfile.write(str(rec))
    finally:
        # Close the output even when reading or writing fails.
        outfile.close()
279
def fasta_delimiter(fastaFile):
    """Return '|' when the header of the first record in fastaFile
    contains a '|' character, and None otherwise.
    """
    first_record = fasta_itr(fastaFile).next()
    if '|' in first_record.header:
        return '|'
    return None
if __name__ == '__main__':

    # Small demonstration: print every record of the file named on the
    # command line, then the slice created with first=1 and last=3.
    import sys

    if len(sys.argv) != 2:
        print("missing file name")
        sys.exit(2)
    fasta_path = sys.argv[1]

    print('iterating through all sequences in input file')
    for record in fasta_itr(fasta_path):
        print(record)

    print('iterating through input, from the second sequence')
    for record in fasta_slice(fasta_path, 1, 3):
        print(record)