PyFileSlice
From Sfvlug
(Source code to a fileslicing program in python) |
|||
Line 48: | Line 48: | ||
for line in awsdata: | for line in awsdata: | ||
print line | print line | ||
+ | |||
+ | |||
+ | Brian and I had some disagreement about the use of the ''pass'' keyword in Python. I'm of the mind that ''pass'' is analogous to goto/break/etc. and should be avoided. However, research has shown that using ''pass'' is a normal Python idiom. In this case, it actually makes the code cleaner, since it's simply slicing through a properly delimited text file. | ||
+ | |||
+ | However, below is an alternate version that uses nested loops and a boolean flag to signal when to exit the outer loop. In a C++ world, this is technically the "right" way to do it, but in Python, I'm not so sure. The ''pass'' version is much easier to understand. | ||
+ | |||
+ | signal = True | ||
+ | while signal == True: | ||
+ | line = infile.readline() | ||
+ | if line.startswith( "BEGIN_PAGEREFS"): | ||
+ | line = infile.readline() | ||
+ | while not line.startswith("END_PAGEREFS"): | ||
+ | tempLine = line.split(' ') | ||
+ | # note that the use of search() and more complete patterns | ||
+ | # might be more efficient despite compiling. | ||
+ | if re.search( pat1, tempLine[0] ): | ||
+ | awsParsed.append( [ tempLine[0], tempLine[1] ] ) | ||
+ | elif re.search( pat2, tempLine[0] ): | ||
+ | awsParsed.append( [ tempLine[0], tempLine[1] ] ) | ||
+ | line=infile.readline()[:-1] # remove trailing \ | ||
+ | , similar to chomp in perl. | ||
+ | signal = False | ||
+ | else: | ||
+ | line = infile.readline() |
Revision as of 01:52, 12 December 2006
#!/usr/bin/env python
#
# Simple tool to spit out referrer information from an awstats database
# for later searching and analysis.  A good example of file slicing!
#
# Usage: <script> [filename]
#   [filename] is optional; leave it out to read from stdin.
#
# Exit status: 0 on success, 1 on bad usage, 2 if the file cannot be opened.

__author__ = "Nick Guy & Brian Guy"
__license__ = "GPL"

import string  # unused by the slicer itself; kept so nothing else loses it
import sys

# Start and end block tokens.  The delimiter lines themselves are NOT
# included in the output.
BEGIN_TOKEN = "BEGIN_PAGEREFS"
END_TOKEN = "END_PAGEREFS"


def slice_block(infile, begin=BEGIN_TOKEN, end=END_TOKEN):
    """Return the lines found between the *begin* and *end* marker lines.

    infile -- any object with a readline() method that returns "" at EOF.
    begin  -- prefix of the line that opens the block.
    end    -- prefix of the line that closes the block.

    Every returned line has its trailing newline removed (like chomp in
    perl).  Reading stops at EOF, so a missing begin marker yields an empty
    list and a missing end marker yields everything after the begin marker,
    rather than looping forever.
    """
    collected = []
    inside = False
    # iter(callable, sentinel) keeps calling readline() until it returns "",
    # which is how file objects report EOF.
    for line in iter(infile.readline, ""):
        if not inside:
            inside = line.startswith(begin)
            continue
        if line.startswith(end):
            break
        # rstrip only removes a newline that is really there, so a final
        # line without one does not lose its last character.
        collected.append(line.rstrip("\n"))
    return collected


def main(argv=None):
    """Run the slicer and return the process exit status.

    argv -- argument list including the program name; defaults to sys.argv.
    """
    if argv is None:
        argv = sys.argv
    argc = len(argv)

    if argc > 2:
        print(argv[0] + " [filename]")
        print("[filename] is optional, leave out to use stdin")
        return 1

    if argc == 2:
        try:
            infile = open(argv[1], 'r')
        except IOError:
            print("Can't open " + argv[1] + " for reading.")
            return 2
    else:
        infile = sys.stdin

    try:
        awsdata = slice_block(infile)
    finally:
        # Close only what we opened, even if reading fails part-way.
        if infile is not sys.stdin:
            infile.close()

    # send data to stdout.
    for line in awsdata:
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Brian and I had some disagreement about the use of the pass keyword in Python. I'm of the mind that pass is analogous to goto/break/etc. and should be avoided. However, research has shown that using pass is a normal Python idiom. In this case, it actually makes the code cleaner, since it's simply slicing through a properly delimited text file.
However, below is an alternate version that uses nested loops and a boolean flag to signal when to exit the outer loop. In a C++ world, this is technically the "right" way to do it, but in Python, I'm not so sure. The pass version is much easier to understand.
# Alternate slicer: nested loops plus a boolean flag instead of ``pass``.
# Relies on names set up elsewhere in the script: ``infile`` (open file
# object), ``re``, the patterns ``pat1`` / ``pat2`` and the result list
# ``awsParsed``.
signal = True
while signal:
    # Exactly one readline() per pass, so no line is skipped while hunting
    # for the start token.
    line = infile.readline()
    if not line:
        # EOF before BEGIN_PAGEREFS: nothing to parse, stop looping.
        signal = False
    elif line.startswith("BEGIN_PAGEREFS"):
        line = infile.readline()
        # An empty string means EOF, so a missing END_PAGEREFS cannot spin
        # forever.
        while line and not line.startswith("END_PAGEREFS"):
            # Remove the trailing \n (similar to chomp in perl) from every
            # line, the first one included, before splitting into fields.
            tempLine = line.rstrip("\n").split(' ')
            # note that the use of search() and more complete patterns
            # might be more efficient despite compiling.
            # Both patterns store the same pair, so they share one branch;
            # lines with fewer than two fields are skipped.
            if len(tempLine) > 1 and (
                    re.search(pat1, tempLine[0])
                    or re.search(pat2, tempLine[0])):
                awsParsed.append([tempLine[0], tempLine[1]])
            line = infile.readline()
        signal = False