PyFileSlice
From Sfvlug
|  (Source code to a fileslicing program in python) | |||
| (One intermediate revision not shown) | |||
| Line 48: | Line 48: | ||
|   for line in awsdata: |   for line in awsdata: | ||
|   	print line |   	print line | ||
| + | |||
| + | |||
| + | Brian and I had some disagreement about the use of the ''pass'' keyword in Python.  I'm of the mind that pass is a similar analog to goto/break/etc. that should be avoided.  Research has shown that use of ''pass'' is a normal Python idiom however.  In this case, it actually makes the code cleaner, since it's simply slicing through a properly delimited text file.   | ||
| + | |||
| + | However, below is an alternate version that uses nested loop and a boolean to signal when to exit the loop.  In a C++ world, this is technically the "right" way to do it, but in Python, I'm not so sure.  The ''pass'' version is much easier to understand. | ||
| + | |||
| + |  signal = True | ||
| + |  while signal == True: | ||
| + |  	line = infile.readline() | ||
| + |  	if line.startswith( "BEGIN_PAGEREFS"): | ||
| + |  		line = infile.readline() | ||
| + |  		while not line.startswith("END_PAGEREFS"): | ||
| + |  			tempLine = line.split(' ') | ||
| + |  			# note that the use of search() and more complete patterns | ||
| + |  			# might be more efficient despite compiling. | ||
| + |  			if re.search( pat1, tempLine[0] ): | ||
| + |  				awsParsed.append( [ tempLine[0], tempLine[1] ] ) | ||
| + |  			elif re.search( pat2, tempLine[0] ): | ||
| + |  				awsParsed.append( [ tempLine[0], tempLine[1] ] ) | ||
| + |  			line=infile.readline()[:-1]	# remove trailing \ | ||
| + | , similar to chomp in perl. | ||
| + |  		signal = False | ||
| + |  	else: | ||
| + |  	line = infile.readline() | ||
| + | |||
| + | return to [[Code Vault]] | ||
Current revision as of 01:55, 12 December 2006
#!/usr/bin/env python
#
# Simple tool to spit out referrer information from an awstats database
# for later searching and analysis.  A good example of file slicing!
#
# Usage: <script> [filename]
# [filename] is optional; leave it out to read from stdin.
__author__ = "Nick Guy & Brian Guy"
__license__ = "GPL"

import string
import sys

# Start and end block tokens.  The lines carrying these tokens delimit the
# block we want and are NOT included in the final output.
BEGIN_TOKEN = "BEGIN_PAGEREFS"
END_TOKEN = "END_PAGEREFS"


def slice_block(infile, begin=BEGIN_TOKEN, end=END_TOKEN):
    """Return the lines found between the begin and end marker lines.

    infile -- any iterable of text lines (an open file, sys.stdin, a list).
    begin  -- prefix identifying the line that opens the block.
    end    -- prefix identifying the line that closes the block.

    The marker lines themselves are excluded and every returned line has
    its trailing newline removed (similar to chomp in perl).  If the begin
    marker never appears an empty list is returned; if the end marker is
    missing, everything up to end of input is returned.
    """
    lines = iter(infile)

    # Skip forward to the begin marker.  Iterating (rather than calling
    # readline() in a while loop) means hitting EOF ends the scan instead
    # of spinning forever on empty strings.
    for line in lines:
        if line.startswith(begin):
            break
    else:
        return []

    block = []
    for line in lines:
        if line.startswith(end):
            break
        # rstrip only removes a newline that is really there, so a final
        # line without a terminator does not lose its last character.
        block.append(line.rstrip("\r\n"))
    return block


def main(argv=None):
    """Slice the page-referrer block out of the input and print it.

    Exits with status 1 on bad usage and 2 if the file cannot be opened.
    """
    if argv is None:
        argv = sys.argv
    argc = len(argv)

    if argc > 2:
        sys.stdout.write(argv[0] + " [filename]\n")
        sys.stdout.write("[filename] is optional, leave out to use stdin\n")
        sys.exit(1)

    if argc == 2:
        try:
            infile = open(argv[1], 'r')
        except IOError:
            sys.stdout.write("Can't open " + argv[1] + " for reading.\n")
            sys.exit(2)
        try:
            awsdata = slice_block(infile)
        finally:
            # Only close what we opened ourselves.
            infile.close()
    else:
        awsdata = slice_block(sys.stdin)

    # send data to stdout.
    for line in awsdata:
        sys.stdout.write(line + "\n")


if __name__ == "__main__":
    main()
Brian and I had some disagreement about the use of the pass keyword in Python.  My view was that pass is analogous to goto/break/etc. and should be avoided.  Some research showed, however, that using pass is a normal Python idiom.  In this case it actually makes the code cleaner, since the program is simply slicing through a properly delimited text file.
However, below is an alternate version that uses a nested loop and a boolean flag to signal when to exit the loop. In a C++ world, this is technically the "right" way to do it, but in Python, I'm not so sure. The pass version is much easier to understand.
# Alternate version: a nested loop plus a boolean flag instead of `pass`.
# This is a fragment -- it expects `infile` (an open file), `pat1` and
# `pat2` (regex patterns), `awsParsed` (a list) and the `re` module to be
# provided by the surrounding script.
signal = True
while signal:
    # The loop reads exactly one line per pass.  Reading again in an
    # `else:` branch would silently skip every second line.
    line = infile.readline()
    if not line:
        # readline() returns "" at EOF; stop instead of looping forever.
        break
    if line.startswith("BEGIN_PAGEREFS"):
        line = infile.readline()
        while line and not line.startswith("END_PAGEREFS"):
            # remove trailing \n, similar to chomp in perl.
            tempLine = line.rstrip("\n").split(' ')
            # note that the use of search() and more complete patterns
            # might be more efficient despite compiling.
            # Lines without a second field are skipped rather than
            # raising IndexError.
            if len(tempLine) > 1 and (re.search(pat1, tempLine[0])
                                      or re.search(pat2, tempLine[0])):
                awsParsed.append([tempLine[0], tempLine[1]])
            line = infile.readline()
        signal = False
return to Code Vault
