Commit | Line | Data |
---|---|---|
1e4a197e RD |
1 | """text_file |
2 | ||
3 | provides the TextFile class, which gives an interface to text files | |
4 | that (optionally) takes care of stripping comments, ignoring blank | |
5 | lines, and joining lines with backslashes.""" | |
6 | ||
7 | __revision__ = "$Id$" | |
8 | ||
9 | from types import * | |
10 | import sys, os, string | |
11 | ||
12 | ||
class TextFile:

    """Provides a file-like object that takes care of all the things you
    commonly want to do when processing a text file that has some
    line-by-line syntax: strip comments (as long as "#" is your
    comment character), skip blank lines, join adjacent lines by
    escaping the newline (ie. backslash at end of line), strip
    leading and/or trailing whitespace.  All of these are optional
    and independently controllable.

    Provides a 'warn()' method so you can generate warning messages that
    report physical line number, even if the logical line in question
    spans multiple physical lines.  Also provides 'unreadline()' for
    implementing line-at-a-time lookahead.

    Constructor is called as:

        TextFile (filename=None, file=None, **options)

    It bombs (RuntimeError) if both 'filename' and 'file' are None;
    'filename' should be a string, and 'file' a file object (or
    something that provides 'readline()' and 'close()' methods).  It is
    recommended that you supply at least 'filename', so that TextFile
    can include it in warning messages.  If 'file' is not supplied,
    TextFile creates its own using the 'open()' builtin.

    The options are all boolean, and affect the value returned by
    'readline()':
      strip_comments [default: true]
        strip from "#" to end-of-line -- unless the "#" is escaped by a
        backslash.  Whitespace leading up to the "#" is left in place
        by this step; it is removed only if 'rstrip_ws' is true.
      lstrip_ws [default: false]
        strip leading whitespace from each line before returning it
      rstrip_ws [default: true]
        strip trailing whitespace (including line terminator!) from
        each line before returning it
      skip_blanks [default: true]
        skip lines that are empty *after* stripping comments and
        whitespace.  (If both lstrip_ws and rstrip_ws are false,
        then some lines may consist of solely whitespace: these will
        *not* be skipped, even if 'skip_blanks' is true.)
      join_lines [default: false]
        if a backslash is the last non-newline character on a line
        after stripping comments and whitespace, join the following line
        to it to form one "logical line"; if N consecutive lines end
        with a backslash, then N+1 physical lines will be joined to
        form one logical line.
      collapse_join [default: false]
        strip leading whitespace from lines that are joined to their
        predecessor; only matters if (join_lines and not lstrip_ws)

    Note that since 'rstrip_ws' can strip the trailing newline, the
    semantics of 'readline()' must differ from those of the builtin file
    object's 'readline()' method!  In particular, 'readline()' returns
    None for end-of-file: an empty string might just be a blank line (or
    an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
    not."""

    default_options = { 'strip_comments': 1,
                        'skip_blanks':    1,
                        'lstrip_ws':      0,
                        'rstrip_ws':      1,
                        'join_lines':     0,
                        'collapse_join':  0,
                      }

    def __init__ (self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
        (a string) and 'file' (a file-like object) must be supplied.
        The keyword argument options are described above and affect
        the values returned by 'readline()'.

        Raises RuntimeError if neither 'filename' nor 'file' is given,
        and KeyError if an unknown option name is passed."""

        if filename is None and file is None:
            raise RuntimeError(
                "you must supply either or both of 'filename' and 'file'")

        # sanity check client option hash before touching any state
        for opt in options:
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        # set values for all options -- either from client option hash
        # or fallback to default_options
        for opt in self.default_options:
            setattr (self, opt, options.get (opt, self.default_options[opt]))

        if file is None:
            self.open (filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []


    def open (self, filename):
        """Open a new file named 'filename'.  This overrides both the
        'filename' and 'file' arguments to the constructor."""

        self.filename = filename
        self.file = open (self.filename, 'r')
        self.current_line = 0


    def close (self):
        """Close the current file and forget everything we know about it
        (filename, current line number).  Calling it again on an already
        closed TextFile is a no-op."""

        if self.file is not None:
            self.file.close ()
        self.file = None
        self.filename = None
        self.current_line = None


    def gen_error (self, msg, line=None):
        """Return 'msg' prefixed with the filename (when one is known)
        and a line number.  'line' may be an integer, or a list or tuple
        of two integers denoting a range of physical lines; if omitted,
        the current line number (or range) is used."""

        outmsg = []
        if line is None:
            line = self.current_line
        # 'filename' is legitimately None when the client supplied only
        # a file object, so there is nothing to prepend in that case
        if self.filename is not None:
            outmsg.append (self.filename + ", ")
        if isinstance (line, (list, tuple)):
            outmsg.append ("lines %d-%d: " % tuple (line))
        else:
            outmsg.append ("line %d: " % line)
        outmsg.append (str (msg))
        return "".join (outmsg)


    def error (self, msg, line=None):
        """Raise ValueError with a message built by 'gen_error()'; the
        'msg' and 'line' arguments are interpreted as they are there."""
        raise ValueError ("error: " + self.gen_error (msg, line))

    def warn (self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
        line in the current file.  If the current logical line in the
        file spans multiple physical lines, the warning refers to the
        whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
        the current line number; it may be a list or tuple to indicate a
        range of physical lines, or an integer for a single physical
        line."""
        sys.stderr.write ("warning: " + self.gen_error (msg, line) + "\n")


    def _count_line (self, joining):
        """Record that one more physical line has been consumed.  When
        'joining' is true the line belongs to the logical line being
        built up, so 'current_line' becomes (or extends) a
        [first, last] range; otherwise it becomes the plain number of
        the line just read."""

        current = self.current_line
        if joining:
            if isinstance (current, list):
                current[1] = current[1] + 1
            else:
                self.current_line = [current, current + 1]
        else:
            if isinstance (current, list):
                # previous logical line was a range: carry on from its end
                self.current_line = current[1] + 1
            else:
                self.current_line = current + 1


    def readline (self):
        """Read and return a single logical line from the current file (or
        from an internal buffer if lines have previously been "unread"
        with 'unreadline()').  If the 'join_lines' option is true, this
        may involve reading multiple physical lines concatenated into a
        single string.  Updates the current line number, so calling
        'warn()' after 'readline()' emits a warning about the physical
        line(s) just read.  Returns None on end-of-file, since the empty
        string can occur if 'rstrip_ws' is true but 'skip_blanks' is
        not."""

        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop ()

        buildup_line = ''

        while 1:
            # read the line, make it None if EOF
            line = self.file.readline ()
            if line == '':
                line = None

            # did previous line end with a backslash?
            joining = self.join_lines and buildup_line

            if self.strip_comments and line:

                # Look for the first "#" in the line.  If none, never
                # mind.  If we find one and it's the first character, or
                # is not preceded by "\", then it starts a comment --
                # strip the comment and carry on.  Otherwise, it's just
                # an escaped "#", so unescape it (and any other escaped
                # "#"'s that might be lurking in there) and otherwise
                # leave the line alone.

                pos = line.find ("#")
                if pos == -1:           # no "#" -- no comments
                    pass

                # It's definitely a comment -- either "#" is the first
                # character, or it's elsewhere and unescaped.
                elif pos == 0 or line[pos-1] != "\\":
                    # Have to preserve the trailing newline, because it's
                    # the job of a later step (rstrip_ws) to remove it --
                    # and if rstrip_ws is false, we'd better preserve it!
                    eol = ''
                    if line[-1] == '\n':
                        eol = '\n'
                    line = line[0:pos] + eol

                    # If all that's left is whitespace, then skip line
                    # *now*, before we try to join it to 'buildup_line' --
                    # that way constructs like
                    #   hello \\
                    #   # comment that should be ignored
                    #   there
                    # result in "hello there".  The skipped line is still
                    # a physical line, so it must be counted or every
                    # later line number would be too small.
                    if line.strip () == "":
                        self._count_line (joining)
                        continue

                else:                   # it's an escaped "#"
                    line = line.replace ("\\#", "#")


            # previous line ended with a backslash: accumulate
            if joining:
                # oops: end of file
                if line is None:
                    self.warn ("continuation line immediately precedes "
                               "end-of-file")
                    return buildup_line

                if self.collapse_join:
                    line = line.lstrip ()
                line = buildup_line + line
                self._count_line (1)

            # just an ordinary line, read it as usual
            else:
                if line is None:        # eof
                    return None
                self._count_line (0)


            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip ()
            elif self.lstrip_ws:
                line = line.lstrip ()
            elif self.rstrip_ws:
                line = line.rstrip ()

            # blank line (whether we rstrip'ed or not)?  skip to next line
            # if appropriate
            if (line == '' or line == '\n') and self.skip_blanks:
                continue

            if self.join_lines:
                # slices rather than indexing: 'line' can be the empty
                # string here when 'skip_blanks' is false
                if line[-1:] == '\\':
                    buildup_line = line[:-1]
                    continue

                if line[-2:] == '\\\n':
                    buildup_line = line[0:-2] + '\n'
                    continue

            # well, I guess there's some actual content there: return it
            return line

    # readline ()


    def readlines (self):
        """Read and return the list of all logical lines remaining in the
        current file."""

        lines = []
        while 1:
            line = self.readline ()
            if line is None:
                return lines
            lines.append (line)


    def unreadline (self, line):
        """Push 'line' (a string) onto an internal buffer that will be
        checked by future 'readline()' calls.  Handy for implementing
        a parser with line-at-a-time lookahead."""

        self.linebuf.append (line)
306 | ||
307 | ||
if __name__ == "__main__":
    # Self-test: write a small sample file, read it back through TextFile
    # with several option combinations, and report "ok"/"not ok" for each.
    # The continuation line is indented so that 'collapse_join' has
    # leading whitespace to remove (results 5 and 6 differ only in that).
    test_data = """# test file

line 3 \\
  # intervening comment
  continues on next line
"""
    # result 1: no fancy options
    result1 = [x + "\n" for x in test_data.split ("\n")[0:-1]]

    # result 2: just strip comments
    result2 = ["\n",
               "line 3 \\\n",
               "  continues on next line\n"]

    # result 3: just strip blank lines
    result3 = ["# test file\n",
               "line 3 \\\n",
               "  # intervening comment\n",
               "  continues on next line\n"]

    # result 4: default, strip comments, blank lines, and trailing whitespace
    result4 = ["line 3 \\",
               "  continues on next line"]

    # result 5: strip comments and blanks, plus join lines (but don't
    # "collapse" joined lines)
    result5 = ["line 3   continues on next line"]

    # result 6: strip comments and blanks, plus join lines (and
    # "collapse" joined lines)
    result6 = ["line 3 continues on next line"]

    def test_input (count, description, file, expected_result):
        # Read every logical line from 'file' (a TextFile), close it, and
        # print whether the list matches 'expected_result'.
        result = file.readlines ()
        file.close ()
        if result == expected_result:
            print ("ok %d (%s)" % (count, description))
        else:
            print ("not ok %d (%s):" % (count, description))
            print ("** expected:")
            print (expected_result)
            print ("** received:")
            print (result)


    filename = "test.txt"
    out_file = open (filename, "w")
    try:
        out_file.write (test_data)
    finally:
        out_file.close ()

    # remove the scratch file even if one of the checks blows up
    try:
        in_file = TextFile (filename, strip_comments=0, skip_blanks=0,
                            lstrip_ws=0, rstrip_ws=0)
        test_input (1, "no processing", in_file, result1)

        in_file = TextFile (filename, strip_comments=1, skip_blanks=0,
                            lstrip_ws=0, rstrip_ws=0)
        test_input (2, "strip comments", in_file, result2)

        in_file = TextFile (filename, strip_comments=0, skip_blanks=1,
                            lstrip_ws=0, rstrip_ws=0)
        test_input (3, "strip blanks", in_file, result3)

        in_file = TextFile (filename)
        test_input (4, "default processing", in_file, result4)

        in_file = TextFile (filename, strip_comments=1, skip_blanks=1,
                            join_lines=1, rstrip_ws=1)
        test_input (5, "join lines without collapsing", in_file, result5)

        in_file = TextFile (filename, strip_comments=1, skip_blanks=1,
                            join_lines=1, rstrip_ws=1, collapse_join=1)
        test_input (6, "join lines with collapsing", in_file, result6)
    finally:
        os.remove (filename)