#!/usr/bin/env ruby
# =Create Dictionary Program Overview
#
# Version:: 0.3 | September 27, 2011
# Author:: Jon Stacey
# Email:: jon@jonsview.com
# Website:: http://jonsview.com
#
# ==Description
# This program creates the valid English word dictionary needed for my
# MNGT 852 Database Organization research project.
#
# This application processes and combines word lists from three sources:
# 12-dicts, SCOWL, and Wiktionary.
#
# The resulting dictionary is written to a text file [one word per line]
#
# ==Usage
# ruby ./create_dictionary.rb
#
# Input and output locations default to the author's original paths. Set the
# WORD_DICTIONARY_DIR environment variable to point at a different directory
# containing the three sources, and DICTIONARY_OUTPUT to change the file that
# is written.
#
# ==License
# Copyright (c) 2011 Jon Stacey. All rights reserved.
#
# I grant the right of modification and redistribution of this application
# for non-profit use under the condition that the above Copyright and
# author information is retained.
#
# ==Disclaimer
# This script is provided "AS-IS" with no warranty or guarantees.
#
# ==Changelog
# 0.3 - 9/27/2011: Made pretty for use in paper.
# 0.2 - 9/25/2011: Downcase all words; functionized; incorporate SCOWL word lists, incorporate Wiktionary dictionary.
# 0.1 - 9/15/2011: Initial creation

# Directory that holds the three raw word sources.
SOURCE_ROOT = ENV.fetch(
  'WORD_DICTIONARY_DIR',
  '/Users/jon/Documents/School/Database Organization/Paper #3 - Wikipedia Project/Research/Word Dictionary/'
)

# Locations of the individual sources inside SOURCE_ROOT.
DICTS12_DIR = File.join(SOURCE_ROOT, '12dicts-5.0')
SCOWL_DIR = File.join(SOURCE_ROOT, 'SCOWL')
WIKTIONARY_DUMP = File.join(SOURCE_ROOT, 'enwiktionary-latest-pages-articles.xml')

# File the finished dictionary is written to.
OUTPUT_FILE = ENV.fetch('DICTIONARY_OUTPUT', '/Users/jon/Desktop/dictionary.txt')

# The 12-dicts lists that are read from DICTS12_DIR.
DICTS12_FILES = %w[
  2+2gfreq.txt 2+2lemma.txt 2of4brif.txt 2of12.txt 2of12inf.txt
  3esl.txt 5desk.txt 6of12.txt neol2007.txt
].freeze

# Matches a run of letters, digits, apostrophes, periods, hyphens and
# whitespace. Any other character on a line acts as a separator between
# entries.
WORD_PATTERN = /[A-Za-z0-9'.\-\s]+/

# Normalizes the combined word list: drops entries containing non-ASCII
# characters, lowercases everything, removes duplicates and sorts.
#
# words:: Array of String. The array and its strings are modified in place.
#
# Returns the same array, sorted.
def postprocess(words)
  # force_encoding only re-tags the bytes as UTF-8 (no transcoding), so the
  # ASCII check does not depend on the encoding each source was read with.
  words.delete_if { |word| !word.force_encoding('UTF-8').ascii_only? }
  words.each(&:downcase!)
  words.uniq!
  words.sort!
end

# Collects entries from the 12-dicts word lists.
#
# dict_dir:: directory containing the files named in DICTS12_FILES.
#
# Returns an Array of String (not yet deduplicated or lowercased).
# Raises a SystemCallError (e.g. Errno::ENOENT) if a list cannot be read.
def dicts12(dict_dir = DICTS12_DIR)
  words = []

  DICTS12_FILES.each do |name|
    # File.foreach closes the file for us, even if an error is raised.
    File.foreach(File.join(dict_dir, name)) do |line|
      line.chomp.scan(WORD_PATTERN) do |match|
        word = match.strip
        # Skip blank matches and entries that end with a hyphen.
        words << word unless word.empty? || word.end_with?('-')
      end
    end
  end

  puts "#{words.size} words in 12-dicts lists."
  words
end

# Collects every line of every file found directly inside the SCOWL
# directory.
#
# dict_dir:: directory containing the SCOWL word lists.
#
# Returns an Array of String, one element per line read.
def scowl(dict_dir = SCOWL_DIR)
  words = []

  Dir.glob(File.join(dict_dir, '**')).each do |path|
    # The glob can also return sub-directories, which cannot be read as lists.
    next unless File.file?(path)

    File.foreach(path) { |line| words << line.chomp }
  end

  puts "#{words.size} words in SCOWL."
  words
end

# True when the reader is positioned on the opening tag of element +name+.
# Closing tags report the same name, so the node type has to be checked too.
def element_start?(reader, name)
  reader.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT && reader.name == name
end

# Advances the reader to the next opening +name+ tag and returns the value of
# the node that follows it, or nil if the document ends first.
def next_element_text(reader, name)
  until element_start?(reader, name)
    # read returns nil at the end of the document; stop instead of spinning.
    return nil unless reader.read
  end
  return nil unless reader.read

  reader.value.to_s
end

# True when the page's wikitext has an English section and is not a suffix
# or prefix entry.
def english_word_entry?(wikitext)
  wikitext.include?('==English==') &&
    !wikitext.include?('==Suffix==') &&
    !wikitext.include?('==Prefix==')
end

# Extracts English words from a Wiktionary pages-articles XML dump.
#
# The original dump was downloaded from
# http://dumps.wikimedia.org/enwiktionary/latest/ on 9/25/2011 @ 12:44PM and
# is around 200MB, so it is streamed rather than loaded into memory.
#
# input_file:: path to the XML dump.
#
# Returns an Array of String. Titles that are phrases are split into their
# individual words. Requires the nokogiri gem.
def wiktionary(input_file = WIKTIONARY_DUMP)
  require 'nokogiri' # Loaded lazily: only this source needs the gem.

  words = []
  count = 0

  File.open(input_file) do |io|
    reader = Nokogiri::XML::Reader(io)

    while reader.read
      next unless element_start?(reader, 'page')

      # A nil result means the document ended in the middle of a page.
      word = next_element_text(reader, 'title')
      break if word.nil?
      next unless word.length > 2

      wikitext = next_element_text(reader, 'text')
      break if wikitext.nil?
      next unless english_word_entry?(wikitext)

      # Wiktionary words could also be phrases, so we have to sort that out
      # [e.g. "booster injection"]
      words.concat(word.split)
      count += 1
      puts "#{count} - #{word}"
    end
  end

  puts "#{words.size} words from Wiktionary dump."
  words
end

dictionary = postprocess(dicts12 + scowl + wiktionary)

puts ""
puts "Writing #{dictionary.size} words to dictionary file."

# Write the dictionary to a simple text file
# One word per line
File.open(OUTPUT_FILE, 'w') do |f|
  dictionary.each { |w| f.puts w }
end