1
2 import os
3 from PyML.utils import misc,myio
4 from PyML.base.pymlObject import PyMLobject
5
6 __docformat__ = "restructuredtext en"
7
9
'''A parser class to read datasets from a file.
Each parser supports the following interface:
Constructor - pass a file name / file handle and information on which
patterns/classes/features to read from the file
check - checks whether the file conforms to the format read by the parser
scan - scan the file and build the _address variable that lists the positions
in the file of all the patterns that need to be read
next - read the next pattern (after calling the __iter__ method)
'''
19
# Characters that mark a comment line in the input files handled by the parsers.
commentChar = ['%', '#']
21
23
24 if type(file) == type('') :
25 if not os.path.exists(file) :
26 raise ValueError, "file does not exist at %s" % file
27 self._fileHandle = myio.myopen(file)
28
29 else :
30 self._fileHandle = file
31
32 if 'classes' in args :
33 self.classesToRead = args['classes']
34 else :
35 self.classesToRead = []
36
37 if 'patterns' in args :
38 self.patternsToRead = args['patterns']
39 else :
40 self.patternsToRead = None
41
42 if 'features' in args :
43 self.featuresToRead = args['features']
44 else :
45 self.featuresToRead = []
46
50
54
56
        # Restart iteration over the pattern addresses recorded by scan().
        self._addressIterator = iter(self._address)
        return self
59
60
        '''Return how many patterns will be read (one per scanned address).'''

        return len(self._address)
65
66
70
80
82
83 '''A class for parsing sparse data'''
84
86
87 Parser.__init__(self, file, **args)
88 self.sparsify = False
89 if 'sparsify' in args :
90 self.sparsify = args['sparsify']
91
93
94 self._fileHandle.seek(0)
95 line,pos = self.skipComments()
96 return len(line.split(':')) > 1
97
99
        '''Read only the labels (and pattern IDs) without parsing features.'''
        self._fileHandle.seek(0)
        patternID = None
        L = []

        line,pos = self.skipComments()

        # A comma on a data line separates the pattern ID from the rest.
        patternIDflag = (line.find(",") != -1)
        if patternIDflag :
            patternID = []

        # If the first token (after the optional ID) is a feature:value pair,
        # the file carries no labels and cannot satisfy this call.
        tokens = line.split(',')[-1].split()
        if len(tokens) == 0 or tokens[0].find(':') >= 0 :
            raise ValueError, "unlabeled data"

        while line :

            if patternIDflag:
                (patID, line) = line.split(",")
                patternID.append(patID)
            # the label is the first whitespace-delimited token of the rest
            L.append(line.split()[0])

            line = self._fileHandle.readline()

        return L,patternID
124
126
        '''Scan the file and record the offset of every pattern to be read.

        Side effects: sets _address, _labels, _patternID, _featureDict and
        related bookkeeping attributes used by next().'''
        self._fileHandle.seek(0)
        patternID = None
        self._featureID = []

        address = []

        line, pos = self.skipComments()

        # a comma on a data line means each line is "patternID,rest"
        patternIDflag = (line.find(",") != -1)
        if patternIDflag :
            patternID = []

        # if the first token is already a feature:value pair the data is
        # unlabeled; firstToken marks where features start on each line
        tokens = line.split(',')[-1].split()
        if len(tokens) == 0 or tokens[0].find(':') >= 0 :
            L = None
            labeledData = 0
            firstToken = 0
        else :
            L = []
            labeledData = 1
            firstToken = 1

        self._numFeatures = 0

        # assume integer feature IDs until a non-integer one is observed
        self.integerID = True

        i = 0
        featureDict = {}   # NOTE(review): never used below -- dead local?
        foundIntegerID = False
        while line :
            nextPos = pos + len(line)
            if patternIDflag:
                (patID, line) = line.split(",")

            tokens = line.split()
            if labeledData :
                label = tokens[0]
            else :
                label = None
            # inspect the first non-empty pattern to decide whether every
            # feature ID parses as an integer
            if not foundIntegerID :
                if labeledData :
                    t = tokens[1:]
                else :
                    t = tokens
                if len(t) > 0 :
                    foundIntegerID = True
                    for token in t :
                        try :
                            int(token.split(':')[0])
                        except :
                            self.integerID = False

            # keep the pattern when its class is requested (or no filter set)
            if (label in self.classesToRead or len(self.classesToRead) == 0) :
                if labeledData :
                    L.append(label)
                if patternIDflag :
                    patternID.append(patID)
                address.append(pos)

            pos = nextPos
            line = self._fileHandle.readline()
            i +=1
            if i % 100 == 0 and i > 0 :
                print 'scanned',i,'patterns'

        self._featureDict = {}
        self._featureDict2 = {}
        self._featureKeyDict = {}
        self._address = address
        self._labeledData = labeledData
        self._labels = L
        self._patternIDflag = patternIDflag
        self._patternID = patternID
        self._firstToken = firstToken
203
204
206
        # Restart iteration over the addresses recorded by scan().
        self._addressIterator = iter(self._address)

        return self
210
212
        '''Read the next pattern; returns a dict mapping featureKey -> value.

        Must be preceded by __iter__/scan; updates the feature dictionaries
        as new feature IDs are encountered.'''
        address = self._addressIterator.next()
        self._fileHandle.seek(address)

        line = self._fileHandle.readline()
        if self._patternIDflag:
            (patID, line) = line.split(",")

        tokens = line.split()
        if self._labeledData :
            label = tokens[0]
        else :
            label = None
        # NOTE(review): label is parsed but unused here; labels are
        # collected during scan().

        x = {}
        if len(tokens) > self._firstToken :
            for token in tokens[self._firstToken:] :
                (featureID, featureVal) = token.split(":")
                if self.integerID :
                    featureID = int(featureID)

                uniqueHash = True

                # collision: this ID already maps to a different stored ID
                if (featureID in self._featureDict2 and
                    self._featureDict2[featureID] != featureID) :
                    uniqueHash = False

                    # try to disambiguate by appending a suffix character
                    # NOTE(review): no break after a free slot is found, so
                    # the loop always runs to chr(254) -- looks like a
                    # missing break; confirm against upstream PyML sources.
                    for i in range(255) :
                        fid = featureID + '+' + chr(i)
                        if fid not in self._featureDict2 :
                            featureID = fid
                            uniqueHash = True
                    if not uniqueHash :
                        raise ValueError, 'non-unique hash'

                # non-integer IDs are hashed into an integer feature key
                if not self.integerID :
                    featureKey = hash(featureID)
                else :
                    featureKey = featureID
                self._featureDict[featureID] = featureKey
                self._featureDict2[featureID] = featureID
                self._featureKeyDict[featureKey] = 1

                # zero values are dropped only when sparsify is on
                if float(featureVal) != 0.0 or not self.sparsify :

                    x[featureKey] = float(featureVal)

        return x
260
261 - def postProcess(self) :
262
263 if len(self._featureDict.keys()) != len(misc.unique(self._featureDict.values())) :
264 print len(self._featureDict.keys()), len(misc.unique(self._featureDict.values()))
265 raise ValueError, 'non-unique hash'
266
267 featureKeyDict = {}
268 featureKey = self._featureDict.values()
269 featureKey.sort()
270 for i in range(len(featureKey)) :
271 featureKeyDict[featureKey[i]] = i
272 inverseFeatureDict = misc.invertDict(self._featureDict)
273 featureID = [str(inverseFeatureDict[key]) for key in featureKey]
274
275 return featureID, featureKey, featureKeyDict
276
277
279
    """A class for parsing delimited files"""

    # Class-level defaults; PyMLobject.__init__ copies these onto the
    # instance and overrides them from keyword arguments.
    attributes = {'idColumn' : None,
                  'labelsColumn' : None,
                  'headerRow' : False}
285
287
        """
        :Keywords:
          - `headerRow` - True/False depending on whether the file contains a
            header row that provides feature IDs
          - `idColumn` - set to 0 if the data has pattern IDs in the first column
          - `labelsColumn` - possible values: if there are no patternIDs
            it is either 0 or -1, and if there are patternIDs, 1 or -1
        """

        Parser.__init__(self, file, **args)
        PyMLobject.__init__(self, None, **args)

        # labels in column 1 implies pattern IDs occupy column 0
        if self.labelsColumn == 1 :
            self.idColumn = 0
        # _first is the index of the first feature column
        if self.idColumn is None and self.labelsColumn is None :
            self._first = 0
        else :
            # NOTE(review): relies on Python 2 semantics where
            # max(None, -1) == -1, so labelsColumn == -1 alone yields
            # _first == 0 -- confirm before porting to Python 3.
            self._first = max(self.idColumn, self.labelsColumn) + 1
        print 'label at ', self.labelsColumn
307
        """very loose checking of the format of the file:
        if the first line does not contain a colon (":") it is assumed
        to be in csv format
        the delimiter is determined to be "," if the first line contains
        at least one comma; otherwise a split on whitespaces is used.
        """

        self._fileHandle.seek(0)

        line,pos = self.skipComments()
        # prefer tab, then comma; None falls back to whitespace splitting
        if len(line.split('\t')) > 1 :
            self.delim = '\t'
        elif len(line.split(',')) > 1 :
            self.delim = ','
        else :
            self.delim = None
        line,pos = self.skipHeader(line,pos)
        print 'delimiter', self.delim



        # a colon marks sparse feature:value format, which is not CSV
        if len(line.split(':')) > 1 : return False

        return True
334
        """
        check if the file has a first line that provides the feature IDs
        """

        # strip the trailing newline before splitting into columns
        tokens = line[:-1].split(self.delim)
        # when labels sit in the last column, features end one column earlier
        if self.labelsColumn == -1 :
            self._last = len(tokens) - 1
        else :
            self._last = len(tokens)

        # consume the header row (if declared) and keep its feature IDs
        if self.headerRow :
            self._featureID = tokens[self._first:self._last]
            pos += len(line)
            line = self._fileHandle.readline()


        return line, pos
353
355
356 self._fileHandle.seek(0)
357
358 L = []
359 patternID = []
360
361 line,pos = self.skipComments()
362 line, pos = self.skipHeader(line, pos)
363 tokens = line[:-1].split(self.delim)
364 if self.labelsColumn is None :
365 if len(tokens) == 2 :
366 self.labelsColumn = 1
367 self.idColumn = 0
368 elif len(tokens) == 1 :
369 self.labelsColumn = 0
370
371 i = 1
372 while line :
373 tokens = line[:-1].split(self.delim)
374 if self.idColumn is not None :
375 patternID.append(tokens[self.idColumn])
376 else :
377 patternID.append(str(i))
378 if self.labelsColumn is not None :
379 L.append(tokens[self.labelsColumn])
380 line = self._fileHandle.readline()
381 i =+ 1
382
383 return L,patternID
384
385
387
        '''Record the file offset of every data line.

        Sets _address; _featureID is taken from the header row when present,
        otherwise defaults to running numbers.'''
        self._fileHandle.seek(0)
        self._featureID = None
        address = []

        line,pos = self.skipComments()
        line, pos = self.skipHeader(line, pos)

        tokens = line.split(self.delim)
        self._patternID = []

        # feature count = total columns minus the ID and label columns
        dim = len(tokens) - (self.idColumn is not None) - \
              (self.labelsColumn is not None)

        self._labels = None
        if self.labelsColumn is not None :
            self._labels = []

        i = 0
        while line :
            address.append(pos)
            pos += len(line)
            line = self._fileHandle.readline()
            i +=1
            if i % 1000 == 0 and i > 0 :
                print 'scanned',i,'patterns'

        self._address = address
        # no header row: fall back to running numbers as feature IDs
        if self._featureID is None :
            self._featureID = [str(i) for i in range(dim)]
417
418
420
        '''Read the next pattern and return it as a list of floats.

        Side effect: appends to _labels and _patternID when the file has
        label / ID columns.'''
        address = self._addressIterator.next()
        self._fileHandle.seek(address)

        line = self._fileHandle.readline()
        tokens = line[:-1].split(self.delim)
        # feature columns only; ID/label columns are outside [_first:_last]
        x = [float(token) for token in tokens[self._first:self._last]]
        if self.labelsColumn is not None :
            self._labels.append(tokens[self.labelsColumn])
        if self.idColumn is not None :
            self._patternID.append(tokens[self.idColumn])

        return x
433
434 - def postProcess(self) :
435
436 featureKey = [hash(id) for id in self._featureID]
437 featureKeyDict = {}
438 for i in range(len(featureKey)) :
439 featureKeyDict[featureKey[i]] = i
440
441 return self._featureID, featureKey, featureKeyDict
442
444
    # An explicit format hint from the caller takes precedence over detection.
    if 'hint' in args :
        hint = args['hint']
        if hint == 'sparse' :
            return SparseParser(fileHandle, **args)
        elif hint == 'csv' :
            p = CSVParser(fileHandle, **args)
            # check() is called for its side effect of setting the delimiter
            p.check()

            return p

    # Otherwise auto-detect: try sparse format first, then delimited.
    p = SparseParser(fileHandle, **args)
    if p.check() :
        return p

    p = CSVParser(fileHandle, **args)
    if p.check() :
        return p

    raise ValueError, 'file does not match existing parsers'
464
465
466 -def test(fileName) :
473