#!/usr/bin/env ruby

# =Wikipedia Misspellings Sampler
#
# Version:: 0.2 | September 27, 2011
# Author:: Jon Stacey
# Email:: jon@jonsview.com
# Website:: http://jonsview.com
#
# ==Description
# This program samples Wikipedia via the API.
#
# Random pages are selected, and all revisions for each page are collected.
# Revisions are grouped by year, and a random revision is chosen for each
# year. The selected revision is then spell checked against a dictionary of
# valid words.
#
# ==Assumptions
# There are too many assumptions to discuss here. It is more beneficial to
# read the code and understand its severe limitations and mistakes, or to
# read the corresponding research paper.
#
# ==Usage
# ruby ./sampler.rb
#
# ==License
# Copyright (c) 2011 Jon Stacey. All rights reserved.
#
# I grant the right of modification and redistribution of this application
# for non-profit use under the condition that the above copyright and author
# information is retained.
#
# ==Disclaimer
# This script is provided "AS-IS" with no warranty or guarantees.
#
# ==Changelog
# 0.2 - 9/27/2011: Speed improvement; cleanup
# 0.1 - 9/22/2011: Initial creation

# Use the bundled copy of mediawiki-gateway.
$:.unshift File.join(File.dirname(__FILE__), 'mediawiki-gateway', 'lib')

require 'media_wiki'
require 'pp'
require 'wikicloth'
require 'sanitize'
require 'date'
require 'csv'

SAMPLE_SIZE = 400

@dictionary   = File.readlines('dictionary.txt').map { |word| word.chomp.strip.downcase }
@wikipedia    = MediaWiki::Gateway.new('http://en.wikipedia.org/w/api.php', :retry_count => 20, :maxlag => 30)
@results_file = '/Users/jon/Desktop/results.csv'
@record_years = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011]

# Write the CSV header.
CSV.open(@results_file, 'a+b') do |csv|
  csv << ['page title'] + @record_years
end

# Render wikitext to HTML, strip the markup, and remove leftover artifacts.
def preprocess(wikitext)
  page = WikiCloth::Parser.new(:data => wikitext)
  page = page.to_html
  page = Sanitize.clean(page)

  # Remove galleries (ugly non-regex way)
  page = page.split('</gallery>').collect { |c| c.split('<gallery>')[0] }.join

  # Remove "[edit]" remnants
  page = page.gsub('[edit]', '')

  # Remove footnote markers such as [1]
  page.gsub(/\[[0-9]+\]/, '')
end

# Split rendered text into downcased words and drop tokens that are not
# candidates for spell checking (numbers, dates, ordinals, currency).
def words(page)
  # Get words and downcase them
  words = page.split.map { |word| word.chomp.strip.downcase }

  # Remove some punctuation
  words.map! { |word| word.gsub(/[(\$,?!":;.)]+/, '') }

  words.delete_if do |word|
    # Remove words that consist of only numbers: [0-9]+
    # Remove words that consist of numbers and a trailing "s" or "'s": (('|)s|)
    word = word.gsub(/[0-9]+(('|)s|)/, '')

    # Try to catch percentages
    word = word.gsub(/[0-9.]+%/, '')

    # Remove special-case date ranges, e.g. 1980-90
    word = word.gsub(/[0-9]+-[0-9]+/, '')

    # Forgive me for this disaster, oh computer science overlords.
    # Remove special-case ordinals such as 19th, 2nd, 1st, 3rd
    word = word.gsub(/[0-9]+th/, '')
    word = word.gsub(/[0-9]+nd/, '')
    word = word.gsub(/[0-9]+st/, '')
    word = word.gsub(/[0-9]+rd/, '')

    # Remove special-case currency amounts such as $1.5billion
    word = word.gsub(/[0-9]+[a-zA-Z]+/, '')

    # Attempt to catch remaining numbers, with or without thousands
    # separators and decimals
    word = word.gsub(/\(\d{1,3}(?:,?\d{3})*(?:\.\d+)?\)|-?\d{1,3}(?:,?\d{3})*(?:\.\d+)?/, '')

    # Discard the token if nothing (or only a single character) is left
    word.empty? || word.size == 1
  end

  words
end

# Return the tokens that do not appear in the dictionary. Every
# out-of-dictionary token is treated as a misspelling.
def spellcheck(words)
  misspellings = words - @dictionary
  # error_percentage = ((misspellings.size.to_f / words.size.to_f) * 100).round(2)
  # puts "Found #{misspellings.size} misspellings. That's roughly #{error_percentage}% misspelled words."
  misspellings
end
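# Illustrative sanity check, added as a sketch and not part of the original
# sampling pipeline: exercises words() and spellcheck() on a hard-coded
# sentence. The sample text and the DEBUG environment guard are assumptions
# for demonstration only; run `DEBUG=1 ruby ./sampler.rb` to see the output.
if ENV['DEBUG']
  sample_text = 'Teh quick brown fox jumps over 19th century fences worth $1.5billion.'
  tokens      = words(sample_text)
  puts "Tokens after filtering:  #{tokens.inspect}"            # numeric tokens are gone
  puts "Not found in dictionary: #{spellcheck(tokens).inspect}" # e.g. "teh"
end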
# Render a page, skip boilerplate sections, and return the number of
# misspellings, the total word count, and the misspelling frequencies.
def process_page(page_contents)
  skip_sections = ['references', 'further reading', 'external links']
  page_words    = Array.new

  preprocessor = WikiCloth::Parser.new(:data => page_contents)
  preprocessor.section_list.each do |section|
    next if skip_sections.include?(section.title.downcase.strip)
    contents    = preprocess(section.wikitext)
    page_words += words(contents)
  end

  misspellings = spellcheck(page_words)

  # Group misspellings by frequency
  frequencies = misspellings.inject(Hash.new(0)) { |hash, value| hash[value] += 1; hash }

  return misspellings.size, page_words.size, frequencies
end

# Fetch the full revision history of a page, group the revisions by year,
# and randomly select one revision ID per year.
def random_revision_sample(page_title)
  sample_revisions = Hash.new
  hashed_revisions = Hash.new
  all_revisions    = @wikipedia.revisions(page_title)

  # Group revisions by year
  all_revisions.each do |rev|
    timestamp = DateTime.parse(rev.attribute('timestamp').to_s)
    (hashed_revisions[timestamp.year] ||= []) << rev.attribute('revid').value
  end

  # Randomly select a single revision for each year of history
  hashed_revisions.each { |year, revisions| sample_revisions[year] = revisions[rand(revisions.size)] }

  sample_revisions
end

# Append one CSV row: the page title followed by the error percentage for
# each year in @record_years, or '-' for years with no revisions.
def record_results(page_title, results)
  output = [page_title]

  @record_years.each do |year|
    if results[year].nil?
      output << '-'
    else
      output << results[year][0]
    end
  end

  CSV.open(@results_file, 'a+b') do |csv|
    csv << output
  end
end

# Sample one page: fetch one random revision per year, spell check it, and
# record the misspelling percentage for each year.
def sample(page_title)
  sample_revisions = random_revision_sample(page_title)
  results          = Hash.new

  sample_revisions.each do |year, revision|
    # puts "new revision for #{year} - #{revision}"
    page = @wikipedia.get_revision(revision)
    count, total_words, frequencies = process_page(page)

    # A page with no countable words yields NaN; check finiteness before
    # rounding, because rounding NaN fails on some Ruby versions.
    error_percentage = (count.to_f / total_words.to_f) * 100
    error_percentage = error_percentage.finite? ? error_percentage.round(2) : 0.0

    results[year] = [error_percentage]
  end

  record_results(page_title, results)
end

random_pages = @wikipedia.random(SAMPLE_SIZE, 0) # Get SAMPLE_SIZE random pages from the main namespace

random_pages.each_with_index do |page, index|
  puts "Sampling page #{index + 1} of #{SAMPLE_SIZE} (#{page})"
  sample(page)
end

puts "ALL DONE! Let's see what the results have to say :-)"
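# Optional post-processing sketch, added as an illustration and not part of
# the original workflow: computes the mean error percentage per year from the
# CSV written above. The SUMMARIZE environment guard and the aggregation are
# assumptions about how the results might be inspected.
if ENV['SUMMARIZE']
  rows   = CSV.read(@results_file)
  header = rows.shift

  header[1..-1].each_with_index do |year, i|
    # Skip missing years ('-') and any repeated header rows from prior runs.
    values = rows.map { |row| row[i + 1] }
                 .reject { |v| v.nil? || v == '-' || v == year }
                 .map(&:to_f)
    next if values.empty?

    mean = values.inject(0.0) { |sum, v| sum + v } / values.size
    puts format('%s: %.2f%% mean misspelling rate across %d revisions', year, mean, values.size)
  end
end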