#!/usr/bin/env ruby

# =Wikipedia Misspellings Sampler
#
# Version:: 0.2 | September 27, 2011
# Author:: Jon Stacey
# Email:: jon@jonsview.com
# Website:: http://jonsview.com
#
# ==Description
# This program samples Wikipedia via the API.
#
# Random pages are selected, and all revisions for each page are collected.
# Revisions are grouped by year, and a random revision is chosen for each
# year. The selected revision is then spell checked against a dictionary of
# valid words.
#
# ==Assumptions
# There are too many assumptions to discuss here. It is more beneficial to
# read the code and understand its severe limitations and mistakes, or to
# read the corresponding research paper.
#
# ==Usage
# ruby ./sampler.rb
#
# ==License
# Copyright (c) 2011 Jon Stacey. All rights reserved.
#
# I grant the right of modification and redistribution of this application
# for non-profit use under the condition that the above copyright and author
# information is retained.
#
# ==Disclaimer
# This script is provided "AS-IS" with no warranty or guarantees.
#
# ==Changelog
# 0.2 - 9/27/2011: Speed improvement; cleanup
# 0.1 - 9/22/2011: Initial creation

# Use the bundled copy of mediawiki-gateway.
$:.unshift File.join(File.dirname(__FILE__), 'mediawiki-gateway', 'lib')

require 'media_wiki'
require 'pp'
require 'wikicloth'
require 'sanitize'
require 'date'
require 'csv'

SAMPLE_SIZE = 400

@dictionary   = File.readlines('dictionary.txt').map { |word| word.chomp.strip.downcase }
@wikipedia    = MediaWiki::Gateway.new('http://en.wikipedia.org/w/api.php', :retry_count => 20, :maxlag => 30)
@results_file = '/Users/jon/Desktop/results.csv'
@record_years = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011]

# Write the CSV header.
CSV.open(@results_file, 'a+b') do |csv|
  csv << ['page title'] + @record_years
end

# Render wikitext to HTML, strip the markup, and remove leftover artifacts.
def preprocess(wikitext)
  page = WikiCloth::Parser.new(:data => wikitext)
  page = page.to_html
  page = Sanitize.clean(page)

  # Remove galleries (ugly non-regex way)
  page = page.split('</gallery>').collect { |c| c.split('<gallery>')[0] }.join

  # Remove "[edit]" remnants
  page = page.gsub('[edit]', '')

  # Remove footnote markers such as [1]
  page.gsub(/\[[0-9]+\]/, '')
end

# Split rendered text into downcased words and drop tokens that are not
# candidates for spell checking (numbers, dates, ordinals, currency).
def words(page)
  # Get words and downcase them
  words = page.split.map { |word| word.chomp.strip.downcase }

  # Remove some punctuation
  words.map! { |word| word.gsub(/[(\$,?!":;.)]+/, '') }

  words.delete_if do |word|
    # Remove words that consist of only numbers: [0-9]+
    # Remove words that consist of numbers and a trailing "s" or "'s": (('|)s|)
    word = word.gsub(/[0-9]+(('|)s|)/, '')

    # Try to catch percentages
    word = word.gsub(/[0-9.]+%/, '')

    # Remove special-case date ranges, e.g. 1980-90
    word = word.gsub(/[0-9]+-[0-9]+/, '')

    # Forgive me for this disaster, oh computer science overlords.
    # Remove special-case ordinals such as 19th, 2nd, 1st, 3rd
    word = word.gsub(/[0-9]+th/, '')
    word = word.gsub(/[0-9]+nd/, '')
    word = word.gsub(/[0-9]+st/, '')
    word = word.gsub(/[0-9]+rd/, '')

    # Remove special-case currency amounts such as $1.5billion
    word = word.gsub(/[0-9]+[a-zA-Z]+/, '')

    # Attempt to catch remaining numbers, with or without thousands
    # separators and decimals
    word = word.gsub(/\(\d{1,3}(?:,?\d{3})*(?:\.\d+)?\)|-?\d{1,3}(?:,?\d{3})*(?:\.\d+)?/, '')

    # Discard the token if nothing (or only a single character) is left
    word.empty? || word.size == 1
  end

  words
end

# Return the tokens that do not appear in the dictionary. Every
# out-of-dictionary token is treated as a misspelling.
def spellcheck(words)
  misspellings = words - @dictionary
  # error_percentage = ((misspellings.size.to_f / words.size.to_f) * 100).round(2)
  # puts "Found #{misspellings.size} misspellings. That's roughly #{error_percentage}% misspelled words."
  misspellings
end
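# Illustrative sanity check, added as a sketch and not part of the original
# sampling pipeline: exercises words() and spellcheck() on a hard-coded
# sentence. The sample text and the DEBUG environment guard are assumptions
# for demonstration only; run `DEBUG=1 ruby ./sampler.rb` to see the output.
if ENV['DEBUG']
  sample_text = 'Teh quick brown fox jumps over 19th century fences worth $1.5billion.'
  tokens      = words(sample_text)
  puts "Tokens after filtering:  #{tokens.inspect}"            # numeric tokens are gone
  puts "Not found in dictionary: #{spellcheck(tokens).inspect}" # e.g. "teh"
end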
# Render a page, skip boilerplate sections, and return the number of
# misspellings, the total word count, and the misspelling frequencies.
def process_page(page_contents)
  skip_sections = ['references', 'further reading', 'external links']
  page_words    = Array.new

  preprocessor = WikiCloth::Parser.new(:data => page_contents)
  preprocessor.section_list.each do |section|
    next if skip_sections.include?(section.title.downcase.strip)
    contents    = preprocess(section.wikitext)
    page_words += words(contents)
  end

  misspellings = spellcheck(page_words)

  # Group misspellings by frequency
  frequencies = misspellings.inject(Hash.new(0)) { |hash, value| hash[value] += 1; hash }

  return misspellings.size, page_words.size, frequencies
end

# Fetch the full revision history of a page, group the revisions by year,
# and randomly select one revision ID per year.
def random_revision_sample(page_title)
  sample_revisions = Hash.new
  hashed_revisions = Hash.new
  all_revisions    = @wikipedia.revisions(page_title)

  # Group revisions by year
  all_revisions.each do |rev|
    timestamp = DateTime.parse(rev.attribute('timestamp').to_s)
    (hashed_revisions[timestamp.year] ||= []) << rev.attribute('revid').value
  end

  # Randomly select a single revision for each year of history
  hashed_revisions.each { |year, revisions| sample_revisions[year] = revisions[rand(revisions.size)] }

  sample_revisions
end

# Append one CSV row: the page title followed by the error percentage for
# each year in @record_years, or '-' for years with no revisions.
def record_results(page_title, results)
  output = [page_title]

  @record_years.each do |year|
    if results[year].nil?
      output << '-'
    else
      output << results[year][0]
    end
  end

  CSV.open(@results_file, 'a+b') do |csv|
    csv << output
  end
end

# Sample one page: fetch one random revision per year, spell check it, and
# record the misspelling percentage for each year.
def sample(page_title)
  sample_revisions = random_revision_sample(page_title)
  results          = Hash.new

  sample_revisions.each do |year, revision|
    # puts "new revision for #{year} - #{revision}"
    page = @wikipedia.get_revision(revision)
    count, total_words, frequencies = process_page(page)

    # A page with no countable words yields NaN; check finiteness before
    # rounding, because rounding NaN fails on some Ruby versions.
    error_percentage = (count.to_f / total_words.to_f) * 100
    error_percentage = error_percentage.finite? ? error_percentage.round(2) : 0.0

    results[year] = [error_percentage]
  end

  record_results(page_title, results)
end

random_pages = @wikipedia.random(SAMPLE_SIZE, 0) # Get SAMPLE_SIZE random pages from the main namespace

random_pages.each_with_index do |page, index|
  puts "Sampling page #{index + 1} of #{SAMPLE_SIZE} (#{page})"
  sample(page)
end

puts "ALL DONE! Let's see what the results have to say :-)"
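# Optional post-processing sketch, added as an illustration and not part of
# the original workflow: computes the mean error percentage per year from the
# CSV written above. The SUMMARIZE environment guard and the aggregation are
# assumptions about how the results might be inspected.
if ENV['SUMMARIZE']
  rows   = CSV.read(@results_file)
  header = rows.shift

  header[1..-1].each_with_index do |year, i|
    # Skip missing years ('-') and any repeated header rows from prior runs.
    values = rows.map { |row| row[i + 1] }
                 .reject { |v| v.nil? || v == '-' || v == year }
                 .map(&:to_f)
    next if values.empty?

    mean = values.inject(0.0) { |sum, v| sum + v } / values.size
    puts format('%s: %.2f%% mean misspelling rate across %d revisions', year, mean, values.size)
  end
end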