wikistats

#!/usr/bin/env python                  
# -*- coding: utf-8 -*-
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
# Copyright (C) 2009  Joe Blaylock <jrbl@jrbl.org>
#
#This program is free software: you can redistribute it and/or modify it under 
#the terms of the GNU General Public License as published by the Free Software 
#Foundation, either version 3 of the License, or (at your option) any later 
#version.
#
#This program is distributed in the hope that it will be useful, but WITHOUT 
#ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
#FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more 
#details.
#
#You should have received a copy of the GNU General Public License along with 
#this program.  If not, see <http://www.gnu.org/licenses/>.
"""Answers simple questions from Wikimedia dumps.

FIXME: Uses a DOM parser; should obviously get re-built around SAX
"""

# Imports
import os, sys
from xml.dom import minidom as md
from datetime import datetime
import re
import difflib

from DictDB import DictDB
from EasyIO import *         # ewriteln, owriteln, ewrite, owrite, DEBUG_ERR, DEBUG_ERR

# Module-wide state.  Everything below except HELP_USAGE_EN is mutated at
# runtime by the functions in this file.

# Set from the -v/--verbose option in the __main__ block; passed to DictDB in
# read_yaml and checked by editorCounts before writing progress dots to stderr.
VERBOSE    = False
# Assigned by read_yaml: the DictDB loaded from the usernames YAML file.
YAML_DATA  = None
# Assigned by read_yaml/wikinameIndexFromYAML: maps a wiki name to its key in
# YAML_DATA.
YAML_INDEX = None
# Cache of (year, month) pairs, filled by _populateDateCaches.
MONTHS_IN_DATASET = None
# Cache of (year, month, day) tuples, filled by _populateDateCaches or
# monthTimestampList.
DATE_STAMP_LIST = None
# Cache used by getSiblingRevisions: the sorted revisions of the page that
# was looked at most recently.
SIBLING_REVISIONS = None
# Cache used by is_redirect: parent nodes of every <redirect> element found.
REDIRECT_LIST = None

# Usage string handed to optparse.OptionParser in the __main__ block.
HELP_USAGE_EN = """usage: %prog [wikidump.xml] [usernames.yaml]"""


# Utility Functions
# Alias for datetime.now; used to timestamp the DEBUG_ERR messages.
getWallTime = datetime.now

def getDOM(filename):
    """Parse a MediaWiki XML dump into a minidom document.

    filename is passed straight to minidom.parse.  Returns the parsed
    document.

    If the root element is not <mediawiki>, a message is written to stderr
    and the process exits with status 1.  Start and end of parsing are
    reported through DEBUG_ERR, prefixed with the current wall-clock time.
    """
    DEBUG_ERR("Reading XML Dump into memory...", unicode(getWallTime())+' ')
    dump = md.parse(filename)
    if dump.documentElement.tagName != u'mediawiki':
        # Report on stderr, not stdout: stdout is the default destination of
        # the CSV output.  Exit non-zero so callers can detect the failure.
        sys.stderr.write("%s does not appear to be a MediaWiki dump.  Skipping...\n" % filename)
        sys.exit(1)
    DEBUG_ERR("...done.", unicode(getWallTime())+' ')
    return dump

def getElementText(element):
    """Return the value of element's first child node, encoded as UTF-8.

    Raises AttributeError when the element has no child (firstChild is None).
    """
    first_node = element.firstChild
    raw_value = first_node.nodeValue
    return raw_value.encode("utf8")

def getPageTitleText(page):
    """Return the UTF-8 encoded text of the first <title> element under page."""
    title_elements = page.getElementsByTagName('title')
    first_title = title_elements[0]
    return getElementText(first_title)

def dateTupOnly(dt_string):
    """Split the leading ISO date of dt_string into (year, month, day).

    The parts are taken by fixed position (characters 0-3, 5-6 and 8-9) and
    returned as strings; nothing is validated.
    """
    year = dt_string[0:4]
    month = dt_string[5:7]
    day = dt_string[8:10]
    return (year, month, day)

def eventStream(wikiDOM):
    """Yield one event tuple per <timestamp> element in wikiDOM.

    Each tuple is (date_tuple, timestamp_element, parent, grandparent), where
    date_tuple comes from dateTupOnly and parent/grandparent are the
    timestamp's parentNode and that node's parentNode (treated by the rest of
    this file as the revision and the page).
    """
    for stamp in wikiDOM.getElementsByTagName('timestamp'):
        date_tuple = dateTupOnly(stamp.firstChild.nodeValue)
        revision = stamp.parentNode
        page = revision.parentNode
        yield (date_tuple, stamp, revision, page)

def editorStream(wikiDOM):
    """Yield (editor, revision, date_tuple) for every <username> element.

    Entries whose page title matches one of the skipped namespaces are left
    out.  The editor name is passed through lookupOrAdd; the date is taken
    from the first <timestamp> under the revision.
    """
    skip_title = re.compile("^(Media:)|(Special:)|(MediaWiki:)|(Category:)|(File:)|(Help:)|(Template:)").match
    for name_node in wikiDOM.getElementsByTagName('username'):
        # <username> sits two levels below its revision.
        revision = name_node.parentNode.parentNode
        page_title = getPageTitleText(revision.parentNode)
        if skip_title(page_title) is not None:
            continue
        editor = lookupOrAdd(getElementText(name_node))
        first_stamp = revision.getElementsByTagName('timestamp')[0]
        date_tuple = dateTupOnly(getElementText(first_stamp))
        yield (editor, revision, date_tuple)

def allProposals(wikiDOM):
    """Collect the pages titled "Proposal:..." or "Proposal talk:...".

    Redirect pages are left out.  Returns a list of (title, page) tuples in
    sorted order.
    """
    is_proposal = re.compile("^Proposal:|^Proposal talk:").match
    found = []
    for page in wikiDOM.getElementsByTagName('page'):
        if not is_redirect(page):
            title = getPageTitleText(page)
            if is_proposal(title):
                found.append((title, page))
    found.sort()
    return found

def lookupOrAdd(name):
    """Resolve a wiki name to a real name through the YAML data.

    Returns name unchanged when no YAML data is loaded, when the name is
    unknown (in which case it is registered via addNameToYAML), or when the
    stored real name is empty.  Otherwise returns the stored real name.
    """
    if YAML_DATA == None or YAML_INDEX == None:
        return name
    try:
        record = YAML_DATA[YAML_INDEX[name]]
        real_name = record['real name']
    except KeyError:
        addNameToYAML(name)
        return name
    if real_name == '':
        return name
    return real_name

def _populateDateCaches(dom):
    """Fill the module-level date caches from dom, if they are still empty.

    DATE_STAMP_LIST receives the result of timestampList(dom) and
    MONTHS_IN_DATASET the result of monthTimestampList(dom).  A cache that is
    already set is left alone.
    """
    global DATE_STAMP_LIST, MONTHS_IN_DATASET
    if DATE_STAMP_LIST is None:
        DATE_STAMP_LIST = timestampList(dom)
    if MONTHS_IN_DATASET is None:
        MONTHS_IN_DATASET = monthTimestampList(dom)

def buildDateCache(dom):
    """Populate the date caches and verify that both are set.

    Raises Exception if either cache is still None afterwards.
    """
    _populateDateCaches(dom)
    caches = (DATE_STAMP_LIST, MONTHS_IN_DATASET)
    if any(cache is None for cache in caches):
        raise Exception("Date cache initialization failed.")

def buildEventIndex(event_stream):
    """Group events by date and then by page.

    event_stream yields (date_tuple, timestamp, revision, page) tuples.
    Returns {"YYYY-MM-DD": {page: [(timestamp, revision), ...]}} with the
    pairs kept in stream order.
    """
    index = {}
    for date_parts, stamp, revision, page in event_stream:
        date_key = '-'.join(date_parts)
        per_page = index.setdefault(date_key, {})
        per_page.setdefault(page, []).append((stamp, revision))
    return index

def buildEditorIndex(editor_stream):
    """Group revisions by editor.

    editor_stream yields (editor, revision, date_tuple) tuples.  Returns
    {editor: [(date_tuple, revision), ...]} with the pairs kept in stream
    order.
    """
    index = {}
    for editor, revision, date_tuple in editor_stream:
        index.setdefault(editor, []).append((date_tuple, revision))
    return index

def getRevID(revision):
    """Return the text of the first <id> element under revision, as UTF-8."""
    id_elements = revision.getElementsByTagName('id')
    id_text = id_elements[0].firstChild.nodeValue
    return id_text.encode("utf8")

def getRevEditor(rev, registered_only=False):
    """Name the author of a (timestamp, revision) pair.

    Only rev[1], the revision element, is inspected.  If it contains a
    <username>, that name is returned after passing through lookupOrAdd.
    Otherwise the result is None when registered_only is True, else the text
    of the first <ip> element, or None if there is no <ip> either.
    """
    revision = rev[1]
    usernames = revision.getElementsByTagName('username')
    ips = revision.getElementsByTagName('ip')
    if len(usernames) != 0:
        return lookupOrAdd(getElementText(usernames[0]))
    if registered_only:
        return None
    if len(ips) > 0:
        return getElementText(ips[0])
    return None

def getRevisionText(revision):
    """Return the text of the first <text> element under revision.

    An AttributeError from getElementText (raised when the element has no
    child node) is turned into an empty string.
    """
    text_elements = revision.getElementsByTagName('text')
    try:
        content = getElementText(text_elements[0])
    except AttributeError:
        content = ""
    return content

#def getPreviousRevision(revision):
#    my_id = getRevID(revision)
#    sibs = revision.parentNode.getElementsByTagName('revision')
#    def cmp(x, y):
#        if getRevID(x) < getRevID(y): return -1
#        elif getRevID(x) > getRevID(y): return 1
#        else: return 0
#    sibs = sorted(sibs, cmp)
#    for i in range(len(sibs)):
#        if getRevID(sibs[i]) == my_id:
#            if i == 0: return None
#            return sibs[i -1]
#    return None

def getSiblingRevisions(revision):
    """Return all revisions of revision's page, sorted by revision id.

    The sorted list is cached in SIBLING_REVISIONS and reused as long as the
    requested revision is in the cached list; asking about a revision that is
    not in it rebuilds the cache from revision.parentNode.

    Ids are compared numerically.  getRevID returns strings, and comparing
    those directly would order "9" after "10".  Raises ValueError if an id
    is not an integer.
    """
    global SIBLING_REVISIONS
    if (SIBLING_REVISIONS is None) or (revision not in SIBLING_REVISIONS):
        siblings = revision.parentNode.getElementsByTagName('revision')
        SIBLING_REVISIONS = sorted(siblings, key=lambda rev: int(getRevID(rev)))
    return SIBLING_REVISIONS

def diffTexts(a, b):
    """Count and size the changes needed to turn sequence a into b.

    Uses difflib.SequenceMatcher opcodes.  Returns (count, sizes): count is
    the number of non-'equal' opcodes and sizes lists one size per change,
    in opcode order.  A delete is sized by the span removed from a, an insert
    by the span added from b, and a replace by the larger of the two spans.
    """
    ed_sizes = []
    matcher = difflib.SequenceMatcher(None, a, b)
    for code, i1, i2, j1, j2 in matcher.get_opcodes():
        if code == 'equal':
            continue
        if code == 'delete':
            size = i2 - i1
        elif code == 'insert':
            size = j2 - j1
        else:
            # 'replace': the original spelled one side as (j2 - j2); max()
            # states the intended "larger span" rule directly.
            size = max(i2 - i1, j2 - j1)
        ed_sizes.append(size)
    return (len(ed_sizes), ed_sizes)

def editorList(revlist, registered_only=False):
    """Return the sorted list of distinct editors of the given revisions.

    Each entry of revlist is handed to getRevEditor together with
    registered_only; whatever it returns (including None) is collected once.
    """
    unique = []
    for rev in revlist:
        who = getRevEditor(rev, registered_only)
        if who in unique:
            continue
        unique.append(who)
    unique.sort()
    return unique

def countedEditorList(revlist):
    """Count revisions per registered editor.

    Revisions for which getRevEditor(rev, True) returns None are ignored.
    Returns the (editor, count) items of the tally.
    """
    tally = {}
    for rev in revlist:
        who = getRevEditor(rev, True)
        if who is not None:
            tally[who] = tally.get(who, 0) + 1
    return tally.items()

def pageCountedEditorList(page):
    """Count the revisions of page per editor.

    Editors are resolved with getRevEditor(..., False); revisions for which
    it returns None are skipped.  Returns {editor: revision_count}.
    """
    tally = {}
    for revision in page.getElementsByTagName('revision'):
        who = getRevEditor((None, revision), False)
        if who is not None:
            tally[who] = tally.get(who, 0) + 1
    return tally

def activeMonths(page, editor):
    """Map each year to the months in which editor revised page.

    Returns {year_string: [bool] * 12}; entry i of a list is True when the
    editor has a revision in month i + 1 of that year.  Revisions by other
    editors, or without a <timestamp>, are ignored.
    """
    calendar = {}
    for revision in page.getElementsByTagName('revision'):
        if getRevEditor((None, revision), False) != editor:
            continue
        stamps = revision.getElementsByTagName('timestamp')
        if len(stamps) == 0:
            continue
        date_parts = dateTupOnly(stamps[0].firstChild.nodeValue)
        year = date_parts[0]
        month_index = int(date_parts[1]) - 1
        if year not in calendar:
            calendar[year] = [False] * 12
        calendar[year][month_index] = True
    return calendar

def edNewInMonth(yr, mo, active):
    """Tell whether (yr, mo) is the first month with activity in active.

    yr:     year key as used in active (activeMonths uses the year string).
    mo:     the month.  A string such as "03", as found in a timestamp, is
            read as a 1-based month number.  An int is taken as the 0-based
            index into the month flags, which is what the original
            comparison expected.
    active: {year: [bool] * 12} as built by activeMonths.

    Returns True only when the earliest flagged month is exactly (yr, mo);
    returns False otherwise, including when nothing is flagged.
    """
    if isinstance(mo, int):
        target_index = mo
    else:
        # Timestamp months are strings; the flags are indexed from 0.
        target_index = int(mo) - 1
    for year in sorted(active.keys()):
        for month_index, was_active in enumerate(active[year]):
            if was_active:
                # The first flagged month decides the answer.
                return (year == yr) and (month_index == target_index)
    return False

def timestampList(wikiDOM):
    """Return the sorted, de-duplicated dates found in the data set.

    Every <timestamp> is reduced to a (year, month, day) tuple; timestamps
    whose page title matches one of the skipped namespaces are left out.

    XXX: Makes no attempt to do gap detection.
    """
    skip_title = re.compile("^(Media:)|(Special:)|(MediaWiki:)|(Category:)|(File:)|(Help:)|(Template:)")
    seen = set()
    for stamp in wikiDOM.getElementsByTagName('timestamp'):
        # <timestamp> sits two levels below its page.
        title = getPageTitleText(stamp.parentNode.parentNode)
        if skip_title.match(title) is None:
            seen.add(dateTupOnly(stamp.firstChild.nodeValue))
    return sorted(seen)

def monthTimestampList(wikiDOM):
    """Return the sorted, distinct (year, month) pairs in the data set.

    Works from DATE_STAMP_LIST, filling that cache from timestampList first
    if it is still None.
    """
    global DATE_STAMP_LIST
    if DATE_STAMP_LIST is None:
        DATE_STAMP_LIST = timestampList(wikiDOM)
    months = set()
    for stamp in DATE_STAMP_LIST:
        months.add((stamp[0], stamp[1]))
    return sorted(months)

def datesAndCountsForPagesRevised(time_list, page_list):
    """Count, per page, how many revisions fall on each date.

    time_list: iterable of (year, month, day) string tuples.
    page_list: iterable of (title, page) pairs; every <timestamp> under a
               page counts as one revision.

    Returns {"YYYY-MM-DD": {title: count}, ..., "TOTALS": {title: count}}.
    Every date in time_list gets an entry for every title, zero included,
    and the "TOTALS" entry is always present.

    Raises KeyError if a page carries a timestamp whose date is not in
    time_list.
    """
    day_keys = ['-'.join(t) for t in time_list]
    # Create TOTALS once, up front, so it exists even for an empty time_list.
    counts = {'TOTALS': {}}
    for key in day_keys:
        counts[key] = {}
    for title, pgref in page_list:
        page_days = ['-'.join(dateTupOnly(t.firstChild.nodeValue))
                     for t in pgref.getElementsByTagName('timestamp')]
        for key in day_keys:
            counts[key][title] = 0
        counts['TOTALS'][title] = 0
        for day in page_days:
            counts[day][title] += 1
            counts['TOTALS'][title] += 1
    return counts

def is_redirect(page):
    """Tell whether page is among the nodes that have a <redirect> child.

    On first use REDIRECT_LIST is built from every <redirect> element found
    under page.parentNode; later calls reuse that list.
    """
    global REDIRECT_LIST
    if REDIRECT_LIST is None:
        collected = []
        for marker in page.parentNode.getElementsByTagName('redirect'):
            collected.append(marker.parentNode)
        REDIRECT_LIST = collected
    return page in REDIRECT_LIST

def summaryCountsByDate(event_index):
    """Yield one summary row per date that has at least one content edit.

    event_index is the {date: {page: [(timestamp, revision), ...]}} mapping
    built by buildEventIndex.  Dates are processed in sorted order and the
    "total" figures accumulate across dates.  Each row is a list whose
    columns match getSummaryCSVHeaders:

        date, total content pages, total pages, registered editors seen,
        total edits, registered editors seen (running counter), total
        proposals, proposals edited today, new proposals today, proposal
        edits today, proposal registered editors, proposal editors,
        edits per content page today, new pages today, number of editors
        with 10+ edits today, and their names (each followed by a comma).

    Pages in non-content namespaces and redirects only count towards the
    "total pages" figure.  Anonymous revisions (editor None) are never
    counted as registered editors.
    """
    total_pages         = {}
    total_content_pages = {}
    total_proposals     = {}
    registered_users    = {}
    total_edits         = 0
    new_reg_users       = 0

    proposal_re    = re.compile("^(Proposal:)")
    non_content_re = re.compile("^(User:)|(Talk[: ])|(.+ talk:)|(Media:)|(Special:)|(MediaWiki:)|(Category:)|(File:)|(Help:)|(Template:)|(Proposals/)")

    for date in sorted(event_index.keys()):
        good_content = False        # becomes True once one page is not skipped

        proposals_edited            = 0
        new_proposals_today         = 0
        proposal_edits              = 0
        proposal_editors            = 0
        proposal_registered_editors = 0

        edits_by_pages  = []
        new_pages_today = []
        authors_today   = {}

        for page in sorted(event_index[date].keys()):
            revlist  = event_index[date][page]
            pagename = getPageTitleText(page)

            # Summary (Philippe) stats: every page counts towards the grand
            # total, even if it is skipped below.
            new_flag = pagename not in total_pages
            if new_flag:
                total_pages[pagename] = date

            # Non-content namespaces and redirects go no further.
            if non_content_re.match(pagename) or is_redirect(page):
                continue

            good_content = True
            total_content_pages[pagename] = date

            # editorList(..., True) reports anonymous revisions as None;
            # drop that placeholder so it is not counted as a registered user.
            registered_today = [ed for ed in editorList(revlist, True)
                                if ed is not None]
            for editor in registered_today:
                if editor not in registered_users:
                    registered_users[editor] = date
                    new_reg_users += 1
            total_edits += len(revlist)

            # Proposal stats
            if proposal_re.match(pagename):
                total_proposals[pagename] = date
                proposals_edited += 1
                proposal_edits += len(revlist)
                if new_flag:
                    new_proposals_today += 1
                proposal_registered_editors += len(registered_today)
                editors = editorList(revlist)
                proposal_editors += len(editors)
                if None in editors:
                    proposal_editors -= 1

            # Erik's stats Pt. 1
            edits_by_pages.append(len(revlist))
            if new_flag:
                new_pages_today.append(pagename)
            for name, count in countedEditorList(revlist):
                authors_today[name] = authors_today.get(name, 0) + count

        if not good_content:        # only skipped pages on this date
            continue

        # Erik's stats Pt. 2
        edits_today = sum(edits_by_pages)
        pages_today = len(edits_by_pages)
        # Edits per page: edits divided by pages, both restricted to the
        # content pages counted above.
        if pages_today:
            ed_per_pg_avg = float(edits_today) / pages_today
        else:
            ed_per_pg_avg = 0
        eds10_today = [name for name, count in authors_today.items() if count >= 10]
        eds10_today_str = ''.join([name + ',' for name in eds10_today])

        output = [date,
                  len(total_content_pages), len(total_pages), len(registered_users),
                  total_edits, new_reg_users, len(total_proposals), proposals_edited,
                  new_proposals_today, proposal_edits, proposal_registered_editors,
                  proposal_editors, ed_per_pg_avg, len(new_pages_today),
                  len(eds10_today), eds10_today_str]
        yield output

def editorCounts(editor_index):
    """Yield one statistics row per editor.

    editor_index is the {editor: [(date_tuple, revision), ...]} mapping built
    by buildEditorIndex.  Each row is
    [editor, edits, pages created, avg changes per revision, avg change size]:

      * a revision counts as "page created" when it is the first entry of
        getSiblingRevisions for its page;
      * changes are measured by diffTexts between the previous sibling's
        text (empty for a first revision) and the revision's own text;
      * both averages are taken over the editor's revisions and are 0.0 for
        an editor with no revisions.

    When VERBOSE is set, a dot is written to stderr roughly every 2% of the
    revisions processed.
    """
    DEBUG_ERR("Starting editor-by-editor processing", unicode(getWallTime())+' ')
    debug_revs2go = sum([len(v) for v in editor_index.values()])
    debug_counter = 0
    # Never let the step drop to 0 (fewer than 50 revisions): it is used as
    # a modulus below.
    debug_2pct = max(1, debug_revs2go // 50)
    for ed in editor_index:
        revlist = editor_index[ed]

        pages_created = 0
        edit_counts = []
        edit_sizes = 0

        for dt, rev in revlist:

            if VERBOSE:
                if debug_counter % debug_2pct == 0:
                    sys.stderr.write(".")
                    sys.stderr.flush()
                debug_counter += 1

            sibs = getSiblingRevisions(rev)
            index = sibs.index(rev)
            if index == 0:
                # First revision of its page: diff against an empty text.
                prev_text = ''
                pages_created += 1
            else:
                prev_text = getRevisionText(sibs[index - 1])
            cur_text = getRevisionText(rev)
            editCount, editSizes = diffTexts(prev_text, cur_text)
            edit_counts.append(editCount)
            edit_sizes += sum(editSizes)

        if edit_counts:
            avg_edit_count_per_revision = float(sum(edit_counts)) / len(edit_counts)
            avg_edit_size = float(edit_sizes) / len(edit_counts)
        else:
            avg_edit_count_per_revision = 0.0
            avg_edit_size = 0.0

        yield [ed, len(revlist), pages_created, avg_edit_count_per_revision, avg_edit_size]
    DEBUG_ERR("...done.", unicode(getWallTime())+' ')

def proposalCounts(prop_list, wikiDOM):
    """Yield one statistics row per proposal page.

    prop_list is a list of (title, page) pairs (see allProposals); wikiDOM is
    used to build the date caches.  Each row matches getPageCSVHeaders:

        title, total edits, number of unique editors,
        then per month in MONTHS_IN_DATASET: editors with 10+ edits on the
            page in that month,
        then per month in MONTHS_IN_DATASET: editors for whom edNewInMonth
            reports that month,
        then per date in DATE_STAMP_LIST: edits to the page on that date.
    """
    DEBUG_ERR("Starting proposal-based processing...", unicode(getWallTime())+' ')
    buildDateCache(wikiDOM)
    totals = datesAndCountsForPagesRevised(DATE_STAMP_LIST, prop_list)
    for title, pg_ref in prop_list:
        edcnts = pageCountedEditorList(pg_ref)
        edcnt_keys = edcnts.keys()
        output = [title, totals['TOTALS'][title], len(edcnt_keys)]          # Name, Edits, Unique Editors

        # Tally edits per (year, month) and editor in a single pass over the
        # revisions instead of rescanning them for every month.
        monthly_counts = {}
        for rev in pg_ref.getElementsByTagName('revision'):
            editor = getRevEditor((None, rev), False)
            timestamp = dateTupOnly(rev.getElementsByTagName('timestamp')[0].firstChild.nodeValue)
            if editor is None:
                continue
            per_editor = monthly_counts.setdefault((timestamp[0], timestamp[1]), {})
            per_editor[editor] = per_editor.get(editor, 0) + 1
        for year, month in MONTHS_IN_DATASET:                                # Active Editors per Month (Active = 10+ Edits)
            per_editor = monthly_counts.get((year, month), {})
            output.append(sum([1 for n in per_editor.values() if n >= 10]))

        # An editor's activity calendar does not depend on the month being
        # reported, so build it once per editor.
        calendars = [activeMonths(pg_ref, editor) for editor in edcnt_keys]
        for year, month in MONTHS_IN_DATASET:                                # New Editors per Month ...
            counter = 0
            for calendar in calendars:
                if edNewInMonth(year, month, calendar):
                    counter += 1
            output.append(counter)

        for date in DATE_STAMP_LIST:                                         # Total Edits per day ...
            output.append(totals['-'.join(date)][title])
        yield output
    DEBUG_ERR("...done.", unicode(getWallTime())+' ')

def getSummaryCSVHeaders():
    """Return the header row for the summary CSV, as a new list.

    Columns described by the original author:

        Date: a day.
        Total Content Pages: The total number of pages whose names don't
            begin with "User", "Talk", or "Template".  Should increase
            monotonically.
        Total Pages: The total number of pages in the wiki.  Should increase
            monotonically.
        Page Editors: The total count of unique usernames seen in the log.
            Should increase monotonically.
        Total Edits: The total number of revisions to pages since the dawn
            of time.  Should increase monotonically.
        Total Proposals: The total number of pages whose names begin with
            "Proposal".  Should increase monotonically.
        Proposals Edited: The number of pages called "Proposal" which have
            been edited on Date.
        New Proposals: Number of pages called "Proposal" whose first
            revision is Date.
        Proposal Edits: The number of edits to pages called "Proposal" on
            Date.
        Proposal Reg Editors: Count of unique registered users editing pages
            called "Proposal" on Date.
        Proposal Editors: Count of unique signifiers (usernames and IPs)
            editing pages called "Proposal" on Date.

    The remaining columns are not described here.
    """
    return [
        'Date',
        'Total Content Pages',
        'Total Pages',
        'Page Editors',
        'Total Edits',
        'New Page Editors',
        'Total Proposals',
        'Proposals Edited',
        'New Proposals',
        'Proposal Edits',
        'Proposal Reg Editors',
        'Proposal Editors',
        'Edits/Page Today',
        'New Pages Today',
        '# Editors w/10+ Today',
        'Editors w/10+ Today',
    ]

def getEditorCSVHeaders():
    """Return the header row for the per-editor CSV, as a new list.

    Editor: Registered nick of the editor
    Edits: Number of edits since dawn of time
    Pages Created: Number of first revisions since dawn of time
    Avg Changes/Rev: Average number of changes made to the text per revision
    Avg Change Size: Average size of changes made to texts, in characters
    """
    return [
        'Editor',
        'Edits',
        'Pages Created',
        'Avg Changes/Rev',
        'Avg Change Size',
    ]

def getPageCSVHeaders(wikiDOM):
    """Return the header row for the per-proposal CSV.

    After the three fixed columns come one "Active (10+) Eds" column per
    month in MONTHS_IN_DATASET, one "New Eds" column per month, and one
    "Eds" column per date in DATE_STAMP_LIST.  The date caches are built
    from wikiDOM first.
    """
    buildDateCache(wikiDOM)
    headers = ["Page Name", "Total Edits", "Total Unique Editors"]
    headers.extend(["Active (10+) Eds %s-%s" % (unicode(year), unicode(month))
                    for year, month in MONTHS_IN_DATASET])
    headers.extend(["New Eds %s-%s" % (unicode(year), unicode(month))
                    for year, month in MONTHS_IN_DATASET])
    headers.extend(["Eds %s" % unicode('-'.join(date))
                    for date in DATE_STAMP_LIST])
    return headers

def statsSummary(dumpDOM, output=sys.stdout):
    """Write the per-date summary statistics for dumpDOM to output as CSV."""
    events = eventStream(dumpDOM)
    rows = summaryCountsByDate(buildEventIndex(events))
    csvOut(rows, output, header=getSummaryCSVHeaders())

def statsEditors(dumpDOM, output=sys.stdout):
    """Write the per-editor statistics for dumpDOM to output as CSV.

    psyco is enabled when it can be imported; its absence is ignored.
    """
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass
    editors = editorStream(dumpDOM)
    rows = editorCounts(buildEditorIndex(editors))
    csvOut(rows, output, header=getEditorCSVHeaders())

def statsProposals(dumpDOM, output=sys.stdout):
    """Write the per-proposal statistics for dumpDOM to output as CSV."""
    proposals = allProposals(dumpDOM)
    rows = proposalCounts(proposals, dumpDOM)
    headers = getPageCSVHeaders(dumpDOM)
    csvOut(rows, output, header=headers)

def csvOut(iterable, file_obj=sys.stdout, header=None):
    """Write rows to file_obj as CSV.

    iterable yields the rows; header, when given and non-empty, is written
    first.  Output uses the "ooffice_like" dialect registered here: comma
    delimiter, "\\n" line terminator, every non-numeric field quoted.
    """
    import csv
    csv.register_dialect("ooffice_like", delimiter=',', skipinitialspace=True,
                         lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
    writer = csv.writer(file_obj, dialect="ooffice_like")
    if header:
        writer.writerow(header)
    for row in iterable:
        writer.writerow(row)

def read_yaml(filename):
    """Load the usernames YAML file and build the wiki-name index.

    Sets the module globals YAML_DATA (a DictDB opened on filename) and
    YAML_INDEX (from wikinameIndexFromYAML), and returns both as a tuple.
    """
    global YAML_DATA, YAML_INDEX
    YAML_DATA = DictDB(filename, format='yaml', verbose=VERBOSE)

    DEBUG_ERR("Building index...", unicode(getWallTime()))
    YAML_INDEX = wikinameIndexFromYAML()
    DEBUG_ERR("...done.", unicode(getWallTime())+' ')

    return (YAML_DATA, YAML_INDEX)

def close_yaml(filename):
    """Encode all stored wiki names as UTF-8 and sync YAML_DATA to filename.

    Every unicode entry in each record's 'wiki' list is replaced by its
    UTF-8 encoded form before YAML_DATA.sync(filename) is called.
    """
    for key in YAML_DATA.keys():
        # Iterate over a snapshot; the entries are rewritten in place below.
        for position, payload in enumerate(list(YAML_DATA[key]['wiki'])):
            if isinstance(payload, unicode):
                payload = payload.encode("utf-8")
            YAML_DATA[key]['wiki'][position] = payload
    YAML_DATA.sync(filename)

def wikinameIndexFromYAML():
    """Index every wiki name in YAML_DATA by the key of its record.

    Entries are added to the module-level YAML_INDEX, which is created if it
    does not exist yet, and that dict is returned.  When YAML_DATA is not
    loaded the function returns None.
    """
    global YAML_DATA, YAML_INDEX
    if YAML_INDEX == None:
        YAML_INDEX = {}
    if YAML_DATA == None:
        return
    for record_id in YAML_DATA:
        for wiki_name in YAML_DATA[record_id]['wiki']:
            YAML_INDEX[wiki_name] = record_id
    return YAML_INDEX

def addNameToYAML(name):      # NOT THREAD SAFE
    """Add a blank record for name to YAML_DATA and index it.

    Does nothing when name is already in YAML_INDEX.  The new record is
    stored under the highest existing key plus one (1 when YAML_DATA has no
    keys) and holds the name, decoded from UTF-8, in its 'wiki' list.

    If the name cannot be decoded it is printed and the UnicodeDecodeError
    is re-raised.
    """
    if name in YAML_INDEX:
        return
    try:
        new_id = max(YAML_DATA.keys()) + 1
    except ValueError:
        # max() of an empty key list.
        new_id = 1
    try:
        YAML_DATA[new_id] = {
            'real name': '',
            'email': [''],
            'irc': [''],
            'wiki': [unicode(name, "utf8")],
        }
    except UnicodeDecodeError:
        print(name)
        raise
    YAML_INDEX[name] = new_id


# Test Harness
if __name__ == "__main__":
    # Command-line entry point: parse options, load the YAML user data and
    # the wiki dump, run the selected reports, then write the YAML data back.

    import optparse
    usage = HELP_USAGE_EN
    parser = optparse.OptionParser(usage = usage)
    parser.add_option('-s', '--summary', dest="summary_stats", action="store_true", default=False,
                         help="Whole-archive summary stats, including proposal stats")
    parser.add_option('-e', '--editors', dest="editor_stats", action="store_true", default=False,
                         help="Stats broken out on a per-user basis")
    parser.add_option('-p', '--proposals', dest="proposal_stats", action="store_true", default=False,
                         help="Stats broken out by proposal")
    parser.add_option('-v', '--verbose', dest="verbose_flag", action="store_true", default=False,
                         help="Produce logging output on stderr")
    parser.add_option('-w', '--wikidump', dest="wikidump", action="store", metavar="FILE",
                         help="Process Mediawiki dump file FILE")
    parser.add_option('-y', '--yaml-file', dest="yaml_file", action="store", metavar="FILE",
                         help="Dereference usernames against YAML file FILE")
    parser.add_option('-o', '--output', dest="outfile", action="store", metavar="FILE", default='',
                         help="Write CSV output to FILE")

    opts, args = parser.parse_args()
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    # Positional arguments give the dump and YAML file; the -w / -y options
    # override them.
    wikidump = None
    yaml_file = "usernames.yaml"
    if len(args) >= 1:
        wikidump = args[0]
    if len(args) >= 2:
        yaml_file = args[1]
    if opts.wikidump:
        wikidump = opts.wikidump
    if wikidump == None:
        parser.error("Please specify Mediawiki dump file to process with the -w FILE flag.")
    if opts.yaml_file:
        yaml_file = opts.yaml_file
    VERBOSE = opts.verbose_flag

    outfile = sys.stdout
    if opts.outfile:
        outfile = open(opts.outfile, 'w')
    else:
        if VERBOSE: ewriteln("Using verbose without specifying an outfile is ill-advised.",
                             "WARNING: ")

    try:
        if (opts.summary_stats or opts.editor_stats or opts.proposal_stats):
            read_yaml(yaml_file)
            dumpDOM = getDOM(wikidump)
        else:
            sys.stderr.write("Please select from -e, -p, -s\n")
            parser.print_help()
            sys.exit()

        if opts.summary_stats:
            statsSummary(dumpDOM, outfile)
        if opts.editor_stats:
            statsEditors(dumpDOM, outfile)
        if opts.proposal_stats:
            statsProposals(dumpDOM, outfile)

        # Only reached when every selected report succeeded.
        close_yaml(yaml_file)
        dumpDOM.unlink()
    finally:
        # Close the file opened for -o on every exit path; never close stdout.
        if outfile is not sys.stdout:
            outfile.close()