For a rather simple assignment we had to count words in a text. The script can also count sequences of words: with a sequence length of two, 'a b c d' results in 'a b', 'b c', 'c d'.
With a sequence length of three, 'a b c d' results in 'a b c', 'b c d'. We also had to print some statistics about the counts.
Code:
#!/usr/bin/env python
# Option Parser
from optparse import OptionParser
# itemgetter for sorted()
from operator import itemgetter
# Regex
import re
import sys
# Pattern used to pull "words" out of the input text.
# NOTE(review): the character class also matches ',', so "word," and "word"
# are counted as different words -- confirm whether that is intended.
WORD_PATTERN = r'[\w,\']+'


def build_parser():
    """Return the OptionParser describing the command-line interface."""
    parser = OptionParser()
    parser.add_option('-f', '--file',
                      help='Specify an alternative file. '
                           'Default is austen.train.txt',
                      default='austen.train.txt', dest='filename',
                      type='string', action='store')
    parser.add_option('-n', '--sequence-length',
                      help='Specify an alternative sequence length. '
                           'Default is 1',
                      default=1, dest='sequence', type='int', action='store')
    parser.add_option('-l', '--limit-sequence',
                      help='Limit shown frequencies. Default is None',
                      default=0, dest='limit', type='int', action='store')
    parser.add_option('-v', '--verbose',
                      help='Verbose.',
                      dest='verbose', default=False, action='store_true')
    parser.add_option('-s', '--stats',
                      help='Shows how many sequences occur with a '
                           'frequency of 1 and 2.',
                      dest='stats', default=False, action='store_true')
    return parser


def validate_options(options):
    """Raise ValueError when the numeric options are out of range.

    The sequence length must be at least 1; the limit may be 0 (print no
    individual frequencies) but must not be negative.
    """
    if options.sequence < 1:
        raise ValueError('Sequence must be larger than 0')
    if options.limit < 0:
        # 0 is a valid limit (and the default), so only negatives are errors.
        raise ValueError('Limit must not be negative')


def find_words(text):
    """Return the list of words found in *text*, in order of appearance."""
    return re.findall(WORD_PATTERN, text)


def count_sequences(words, length):
    """Count every run of *length* consecutive words.

    Returns a dict mapping each sequence (its words joined by single spaces,
    e.g. 'a b') to the number of times it occurs. If *words* holds fewer
    than *length* items the result is empty.
    """
    counts = {}
    for start in range(len(words) - length + 1):
        # join() yields 'a b' rather than ' a b' (no leading separator).
        key = ' '.join(words[start:start + length])
        counts[key] = counts.get(key, 0) + 1
    return counts


def rank_sequences(counts):
    """Return (sequence, frequency) pairs sorted by descending frequency."""
    return sorted(counts.items(), key=itemgetter(1), reverse=True)


def frequency_stats(ranked):
    """Summarise a list of (sequence, frequency) pairs.

    Returns a tuple (total, once, twice): the sum of all frequencies, the
    number of sequences occurring exactly once, and the number occurring
    exactly twice.
    """
    frequencies = [num for _, num in ranked]
    return sum(frequencies), frequencies.count(1), frequencies.count(2)


def main(argv=None):
    """Parse the command line, count word sequences and print the results.

    argv: list of command-line arguments without the program name. When it
    is None, optparse falls back to sys.argv[1:].

    Exits with status 1 if the input file cannot be opened and raises
    ValueError for an invalid sequence length or limit.
    """
    options, _ = build_parser().parse_args(argv)
    if options.verbose:
        print(options)
    validate_options(options)

    try:
        f = open(options.filename)
    except IOError:
        print('No such file: %s' % (options.filename))
        sys.exit(1)
    # Read the entire file, closing the handle as soon as we are done.
    with f:
        text = f.read()

    words = find_words(text)
    if options.verbose:
        print('word count: %d' % len(words))

    ranked = rank_sequences(count_sequences(words, options.sequence))

    # Only the `limit` most frequent sequences are printed individually;
    # the statistics below always cover all sequences.
    for sequence, num in ranked[:options.limit]:
        print('%s : %s' % (sequence, num))

    asum, freq1, freq2 = frequency_stats(ranked)
    print('Amount of words per sequence: %d' % options.sequence)
    print('Sum of frequencies: %d' % asum)
    if options.stats:
        # Counts of sequences with frequency 1 and 2 were required by the
        # assignment.
        print('Number of sequences with frequency 1: %d' % freq1)
        print('Number of sequences with frequency 2: %d' % freq2)


if __name__ == '__main__':
    main()
Usage:
Code:
$ ./austen.py -h
Usage: austen.py [options]
Options:
-h, --help show this help message and exit
-f FILENAME, --file=FILENAME
Specify an alternative file. Default is
austen.train.txt
-n SEQUENCE, --sequence-length=SEQUENCE
Specify an alternative sequence length. Default is 1
-l LIMIT, --limit-sequence=LIMIT
Limit shown frequencies. Default is None
-v, --verbose Verbose.
-s, --stats Shows how many sequences occur with a frequency of
1 and 2.