# […] Dictionaries are a common data type, which we have used in several exercises (Mark V. Shaney, Word Frequencies, Dodgson's Doublets, Anagrams). Hash tables are often used as the underlying implementation […]

# Write a program that takes a filename and a parameter n and prints the n most
# common words in the file, and the count of their occurrences, in descending
# order.
# Require for the command line options processor.
require 'getoptlong'

# Set up the command line options we accept:
#   --number/-n N  : how many of the most frequent words to print (default 10)
#   --verbose/-v   : turn on verbose output
opts = GetoptLong.new(
  ["--number", "-n", GetoptLong::REQUIRED_ARGUMENT],
  ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
)

# Set the default values for the options.
number = 10
$verbose = false

# Parse the command line options. GetoptLong raises if it finds one we
# don't recognize; report that on stderr and exit with a failure status
# so shell callers can detect the error.
begin
  opts.each do |opt, arg|
    case opt
    when "--number"
      number = arg.to_i
    when "--verbose"
      $verbose = true
    end
  end
rescue StandardError
  warn "Illegal command line option."
  exit 1
end
# Create the word frequency hash; missing keys default to a count of 0.
word_freq = Hash.new(0)

# Tally every word read from +io+ into the +freq+ hash and return it.
# Words are the runs of characters left after splitting on non-word
# characters and digits, so punctuation and numbers are discarded.
# (Remove the \d from the regex if numbers should be counted as words.)
def count_words(io, freq)
  while line = io.gets
    # Split on one-or-more delimiter characters: the original single-char
    # class (/[\W\d]/) produced an empty string for every run of
    # consecutive delimiters. A delimiter at the start of the line can
    # still yield one leading empty field, so keep the empty-word guard.
    line.split(/[\W\d]+/).each do |word|
      freq[word] += 1 unless word.empty?
    end
  end
  freq
end

# Loop through the remaining arguments which we'll assume are file names.
ARGV.each do |file_name|
  File.open(file_name) do |file|
    count_words(file, word_freq)
  end
end
# Return the +n+ most frequent [word, count] pairs from +freq+, largest
# count first. sort_by with a negated count replaces the original
# sort-then-reverse, extracting each key once and skipping the extra
# reversed-array allocation (tie order was unspecified either way, since
# Ruby's sort is not stable).
def top_words(freq, n)
  freq.sort_by { |_word, count| -count }.first(n)
end

# Print out the n most frequent words and their counts, one per line.
top_words(word_freq, number).each do |word, count|
  puts "#{word} #{count}"
end