Naar inhoud springen

Gebruiker:Rozebotje/Archief Index/code

Uit Wikipedia, de vrije encyclopedie
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""

Generates indexes of archived talk pages.

The following parameters are supported:

    -debug         If given, doesn't do any real changes, but only shows
                   what would have been changed.

    -log           Writes output to logfile

    -page:pagename
                   Create an index only on this page. 
                   Otherwise all pages which transclude 
                   the hometemplate will be processed.

    -logbook:pagename
                   Write a log to this page 

    -defaulttemplate:pagename
                   Default template to use 

    -hometemplate:pagename
                   page which is transcluded to generate the index.
                   *** This is required! ***

"""
__version__ = '$Id$'
import wikipedia
import pagegenerators
import re
import sys
import zlib
from time import strftime, localtime
from operator import itemgetter

# Substitutions applied to the module docstring for the text that is shown
# when you run this script with the parameter -help (framework convention);
# no placeholders are needed here, so the mapping is empty.
docuReplacements = {
}

# contains handy static functions
class TextFunctions:

    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/145672/index_txt
    def formatBlock(block):
        '''Format the given block of text, trimming leading/trailing
        empty lines and any leading whitespace that is common to all lines.
        The purpose is to let us list a code block as a multiline,
        triple-quoted Python string, taking care of indentation concerns.'''
        # separate block into lines
        lines = str(block).split('\n')
        # remove leading/trailing empty lines
        while lines and not lines[0]:  del lines[0]
        while lines and not lines[-1]: del lines[-1]
        # look at first line to see how much indentation to trim
        ws = re.match(r'\s*',lines[0]).group(0)
        if ws:
            lines = [ x.replace(ws,'',1) for x in lines ]
        # remove leading/trailing blank lines (after leading ws removal)
        # we do this again in case there were pure-whitespace lines
        while lines and not lines[0]:  del lines[0]
        while lines and not lines[-1]: del lines[-1]
        return '\n'.join(lines)+'\n'
    formatBlock = staticmethod(formatBlock)

    def getanchor(sectiontext):
        ''' 
        get the anchor link of a section based on the title
        on the dutch wikipedia it is not enough to call wikipedia.sectionencode
        '''
        # remove [[ ]] around sectionname    
        anchor=TextFunctions.removeformatting(sectiontext)
        
        anchor = wikipedia.sectionencode(anchor,wikipedia.getSite().encoding())

        # remove spaces at start and end (which are converted to underscores)
        while anchor[:1]=='_':
            anchor = anchor[1:]
        while anchor[-1:]=='_':
            anchor = anchor[:-1]
        return anchor
    getanchor = staticmethod(getanchor)

    def removeformatting(linktext):
        '''
        remove [[ ]] and '' from a string, convert to text which would be shown
        '''
        linktext=re.sub(r'(?x)\[\[ [^|\]]* \|( [^|\]]* ) \]\]',r'\1',linktext)
        linktext=re.sub(r'(?x)\[\[ ( [^\]]* ) \]\]',r'\1',linktext)
        linktext=re.sub(r"'''(.*)'''",r'\1',linktext)
        linktext=re.sub(r"''(.*)''",r'\1',linktext)
        return linktext
    removeformatting = staticmethod(removeformatting)

class Templates:
    '''
    Used to generate a table from an index.
    A template is a python dictionary mapping section names to text.
    Use get() to obtain a template (loading it from a wikipedia page on
    first use), parsetemplate() to build a template from a string, and
    processindex() to render an index as text through a template.
    '''

    def __init__(self):
        # cache of parsed templates, seeded with the built-in default
        self.default = 'default'
        self.templates = {self.default: self.getdefaulttemplate()}

    def loadpage(self, name):
        '''Fetch page *name* from the wiki and cache its parsed template.'''
        if name is None:
            return
        source = wikipedia.Page(wikipedia.getSite(), name)
        try:
            raw = source.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            return
        self.templates[name] = self.parsetemplate(raw)

    def get(self, name):
        '''Return the template called *name*, loading it on first use and
        falling back to the built-in default when it cannot be loaded.'''
        if name not in self.templates:
            self.loadpage(name)
        return self.templates.get(name, self.templates[self.default])

    def getdefaulttemplate(self):
        '''Return the built-in sortable two-column table template.'''
        raw = TextFunctions.formatBlock('''
            <!-- HEADER -->
            {| class="sortable"
            ! Onderwerp !! Link

            <!-- ROW -->
            |-
            | %%topic%% || [[%%link%%|%%page%%]]

            <!-- ALT ROW -->
            |- style="background: #dddddd;"
            | %%topic%% || [[%%link%%|%%page%%]]

            <!-- FOOTER -->
            |}

            <!-- END -->
            ''')
        return self.parsetemplate(raw)

    def parsetemplate(self, text):
        '''Split *text* on <!-- NAME --> markers and return a dict mapping
        each section name to the (stripped) text that follows its marker.'''
        marker = re.compile(r'<!--([^-]*)-->')
        sections = {}
        current = ''
        for chunk in re.split(r'(<!--[^-]*-->)', text):
            found = marker.match(chunk)
            if found:
                current = found.group(1).strip()
            else:
                sections[current] = chunk.strip() + '\n'
        return sections

    def processindex(self, template, index):
        '''Render *index* (a list of row dicts with keys 'topic', 'link'
        and 'page') through *template*; returns '' when the index is empty.'''
        if not index:
            return ''

        pieces = []
        for key in ('LEAD', 'HEADER'):
            if key in template:
                pieces.append(template[key])

        for rownum, row in enumerate(index, 1):
            # even-numbered rows may use the alternate (striped) row layout
            if rownum % 2 == 0 and 'ALT ROW' in template:
                rowtext = template['ALT ROW']
            else:
                rowtext = template['ROW']
            rowtext = rowtext.replace('%%topic%%', row['topic'])
            rowtext = rowtext.replace('%%link%%', row['link'])
            rowtext = rowtext.replace('%%page%%', row['page'])
            pieces.append(rowtext)

        for key in ('FOOTER', 'TAIL'):
            if key in template:
                pieces.append(template[key])

        rendered = ''.join(pieces)
        # allow %%subst%% and %%now%% to be replaced in all sections, not just ROW
        rendered = rendered.replace('%%subst%%', 'subst:')
        rendered = rendered.replace('%%now%%', strftime("%d %b %Y %H:%M (%Z)"))
        rendered = rendered.replace('%%((%%', '{{')
        rendered = rendered.replace('%%))%%', '}}')
        rendered = rendered.replace('%%(%%', '{')
        rendered = rendered.replace('%%)%%', '}')
        return rendered

class IndexGenerator:
    '''
    Create an archive index of a number of pages.
    An index is a list of dictionaries with the following keys:
        sortkey : lowercase text useful for sorting
        link    : link to page (including the #section anchor)
        page    : title of the (sub) page
        topic   : title of section
    readoptions is used to read an optionstring,
    getoptionstring to return the current options,
    addpage is an internal function to process a single page,
    retrieve is used to generate the index using the previously set options.
    '''
    def __init__(self):
        # per-source-page option dicts ({'page': ...} or {'pageprefix': ...})
        self.pages = []
        # options that apply to the whole index ('template', 'checksum')
        self.globaloptions = {}

    def setoption(self, name, value):
        '''Set global option *name*; a value of None deletes the option.'''
        if value is None:
            if name in self.globaloptions:
                del self.globaloptions[name]
        else:
            self.globaloptions[name] = str(value)

    def readoption(self, name):
        '''Return global option *name* as a string, or None when unset.'''
        if name in self.globaloptions:
            return self.globaloptions[name]
        return None

    def changedchecksum(self, checksum):
        '''Store *checksum* and report whether it differs from the stored one.

        Returns False when the stored checksum already equals *checksum*;
        otherwise records the new value and returns True.
        '''
        if 'checksum' in self.globaloptions and str(self.globaloptions['checksum']) == str(checksum):
            return False
        self.globaloptions['checksum'] = str(checksum)
        return True

    def readoptions(self, txt, pagename):
        '''Parse a ';'-separated 'key=value' option string.

        *pagename* supplies the fallback source (pagename + '/') when no
        page/pageprefix option is present in *txt*.
        '''
        options = txt.split(';')
        for option in options:
            # split on the FIRST '=' only, so values may themselves contain
            # '=' (e.g. include/exclude regexes); the previous maxsplit of 2
            # silently dropped any such option
            opt = option.split('=', 1)
            if len(opt) == 2:
                if opt[0] in ('page', 'pageprefix'):
                    self.pages.append({opt[0]: opt[1]})
                elif (opt[0] in ('name', 'include', 'exclude')) and (len(self.pages) != 0):
                    # these options refine the most recently added page entry
                    self.pages[-1][opt[0]] = opt[1]
                elif opt[0] in ('checksum', 'template'):
                    self.globaloptions[opt[0]] = opt[1]
                else:
                    wikipedia.output('unknown/invalid option: %s=%s' % (opt[0], opt[1]))
        if len(self.pages) == 0:
            self.pages.append({'pageprefix': pagename + '/'})

    def getoptionstring(self):
        '''Serialize the current options back into 'key=value;' string form.'''
        ret = ''
        if 'template' in self.globaloptions:
            ret += 'template=%s;' % self.globaloptions['template']
        for indexpage in self.pages:
            if 'page' in indexpage:
                # 'name' defaults to the page title, matching what retrieve()
                # fills in; the previous code raised KeyError when 'name' had
                # never been set
                ret += 'page=%s;name=%s;' % (indexpage['page'], indexpage.get('name', indexpage['page']))
            elif 'pageprefix' in indexpage:
                ret += 'pageprefix=%s;' % indexpage['pageprefix']
                if 'name' in indexpage:
                    ret += 'name=%s;' % indexpage['name']
                if 'include' in indexpage:
                    ret += 'include=%s;' % indexpage['include']
                elif 'exclude' in indexpage:
                    ret += 'exclude=%s;' % indexpage['exclude']
        if 'checksum' in self.globaloptions:
            ret += 'checksum=%s;' % self.globaloptions['checksum']
        return ret

    def addpage(self, page, shortname):
        '''Scan one wiki page and return index rows for its section titles.

        ==h2== and ===h3=== headings produce rows; ====h4==== headings are
        recognised only so that the body following them is skipped.
        '''
        pagetitle = page.title()
        try:
            text = page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            wikipedia.output('error get()')
            return []

        # \n is enough...
        text = re.sub('\r', '', text)

        ret = []

        lasttitle2 = ""
        lasttitle3 = ""
        lasttitle4 = ""
        title2 = re.compile(r'^==\ *([^= ].*[^= ])\ *==$')
        title3 = re.compile(r'^===\ *([^= ].*[^= ])\ *===$')
        title4 = re.compile(r'====\ *([^= ].*[^= ])\ *====')
        # the capturing group keeps the heading lines in the split result so
        # that the title2/3/4 matches below can see them; without the group
        # re.split() discards the separators and no row is ever produced
        parts = re.split(r'(?m)(^==.*==$)', text)
        for part in parts:
            t2 = title2.match(part)
            t3 = title3.match(part)
            t4 = title4.match(part)
            if t2 is not None:
                lasttitle2 = t2.group(1)
                lasttitle3 = ""
            elif t3 is not None:
                lasttitle3 = t3.group(1)
                lasttitle4 = ""
            elif t4 is not None:
                lasttitle4 = t4.group(1)
            else:
                if lasttitle4 != "":
                    # body of a ====h4==== section: not indexed
                    lasttitle4 = ""
                elif lasttitle3 != "":
                    # ===h3=== section: link as "h3 (h2)"
                    anchor = TextFunctions.getanchor(lasttitle3)
                    linktext = TextFunctions.removeformatting(lasttitle3 + " (" + lasttitle2 + ")")
                    sortkey = linktext.lower()
                    sortkey = re.sub('[^a-z]', '', sortkey)
                    ret.append({'sortkey': sortkey, 'link': pagetitle + "#" + anchor, 'page': shortname, 'topic': linktext})
                elif lasttitle2 != "":
                    # ==h2== section: link under its own title
                    anchor = TextFunctions.getanchor(lasttitle2)
                    linktext = TextFunctions.removeformatting(lasttitle2)
                    sortkey = linktext.lower()
                    sortkey = re.sub('[^a-z]', '', sortkey)
                    ret.append({'sortkey': sortkey, 'link': pagetitle + "#" + anchor, 'page': shortname, 'topic': linktext})
        return ret

    def retrieve(self):
        '''Build and return the full index, sorted by sortkey.'''
        ret = []
        for indexpage in self.pages:
            if 'page' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = indexpage['page']
                page = wikipedia.Page(wikipedia.getSite(), indexpage['page'])
                ret.extend(self.addpage(page, indexpage['name']))
            elif 'pageprefix' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = ''
                # collect matching subpage titles first, then preload them in
                # one batch for fewer server round trips
                pagelist = []
                subpagegen = pagegenerators.PrefixingPageGenerator(prefix = indexpage['pageprefix'])
                for subpage in subpagegen:
                    if 'include' in indexpage:
                        if re.search(indexpage['include'], subpage.title()) is not None:
                            pagelist.append(subpage.title())
                    elif 'exclude' in indexpage:
                        if re.search(indexpage['exclude'], subpage.title()) is None:
                            pagelist.append(subpage.title())
                    else:
                        pagelist.append(subpage.title())
                if pagelist != []:
                    gen = iter([wikipedia.Page(wikipedia.getSite(), t) for t in pagelist])
                    gen = pagegenerators.PreloadingGenerator(gen)
                    for page in gen:
                        ret.extend(self.addpage(page, indexpage['name'] + page.title()[len(indexpage['pageprefix']):]))

        ret.sort(key=itemgetter('sortkey'))

        return ret


class ArchiveBot:
    '''
    Bot that (re)generates archive indexes on every page transcluding the
    configured home template, or on a single page given with -page.
    '''
    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'en': u'Robot: Create archive index',
        'nl': u'robot: Creëer archief index',
    }

    def __init__(self, debug, hometemplate, defaulttemplate, logbook, singlepage):
        """
        Constructor. Parameters:
            * debug           - If True, doesn't do any real changes, but only
                                shows what would have been changed.
            * hometemplate    - Template whose transclusions mark index pages.
            * defaulttemplate - Template page used when a page sets none.
            * logbook         - Wiki page to append a run log to (or None).
            * singlepage      - Process only this page instead of all
                                transclusions (or None).
        """
        self.generator = None
        self.debug = debug
        self.hometemplate = hometemplate
        self.defaulttemplate = defaulttemplate
        self.logbook = logbook
        self.singlepage = singlepage
        self.acceptall = False
        # run statistics, reported by createlog()
        self.processed = 0
        self.changecount = 0
        self.errorcount = 0

        self.templates = Templates()

        if self.singlepage is not None:
            self.generator = iter([wikipedia.Page(wikipedia.getSite(), self.singlepage)])
        else:
            transclusionPage = wikipedia.Page(wikipedia.getSite(), self.hometemplate)
            self.generator = pagegenerators.ReferringPageGenerator(transclusionPage, onlyTemplateInclusion = True)
            self.generator = pagegenerators.PreloadingGenerator(self.generator)

    def createlog(self):
        '''Append a summary of this run to the logbook page, if configured.'''
        if self.logbook is None:
            return
        log_page = wikipedia.Page(wikipedia.getSite(), self.logbook)
        try:
            log_text = log_page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            log_text = ''

        old_log_text = log_text

        # list comprehension instead of list + map(); the latter raises
        # TypeError on Python 3, where map() returns an iterator
        args = [wikipedia.decodeArg(sys.argv[0])] + [wikipedia.decodeArg('"%s"' % s) for s in sys.argv[1:]]

        log_text += '\n* Start: %s\n' % self.starttime
        log_text += r'* Command: <nowiki>' + u' '.join(args) + r'</nowiki>' + '\n'
        log_text += '* Processed: %d pages\n' % self.processed
        log_text += '* Changes: %d pages\n' % self.changecount
        log_text += '* Errors: %d pages\n' % self.errorcount
        log_text += '* End: %s\n' % self.endtime
        log_text += '----\n'

        com = wikipedia.translate(wikipedia.getSite(), self.msg) + ' (Log)'

        wikipedia.showDiff(old_log_text, log_text)

        if not self.debug:
            try:
                log_page.put(log_text, comment = com, minorEdit = True)
            except Exception:
                # best effort: a failed log write must not abort the run
                wikipedia.output(u'Could not save log')

    def run(self):
        '''Process every page from the generator, then write the log.'''
        self.starttime = strftime("%d %b %Y %H:%M (%Z)")
        # Set the edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
        for page in self.generator:
            self.treat(page)

        self.endtime = strftime("%d %b %Y %H:%M (%Z)")
        self.createlog()

    def treat(self, page):
        """
        Loads the given page, regenerates its archive index, and saves it.
        """

        self.processed += 1

        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\03{lightpurple}%s\03{default}:" % page.title())

        try:
            # Load the page
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        if not page.botMayEdit(wikipedia.getSite().loggedInAs()):
            wikipedia.output(u"Page %s is locked for robot editing; skipping." % page.aslink())
            return

        thisindex = IndexGenerator()

        # Locate the {{hometemplate|...}} transclusion together with the
        # optional old index up to the trailing '<!--hometemplate-->' marker.
        # re.escape guards against regex metacharacters in the template name.
        tmplname = re.escape(self.hometemplate)
        full = re.search(r'(\{\{' + tmplname + r'(\|[^}]*)?\}\}(.*' + tmplname + r'-->)?)', text, re.DOTALL)
        if full is None:
            # previously this case crashed later with a NameError; skip instead
            wikipedia.output(u"Page %s does not contain the template; skipping." % page.aslink())
            return
        tmplopt = re.search(r'\{\{' + tmplname + r'\|([^}]*)\}\}', text)
        if tmplopt is not None:
            thisindex.readoptions(tmplopt.group(1), page.title())
        else:
            wikipedia.output('cannot read options, using default')
            thisindex.readoptions('', page.title())

        if thisindex.readoption('template') is None:
            thisindex.setoption('template', self.defaulttemplate)

        idx = thisindex.retrieve()

        # checksum covers only the generated links, so that changes to the
        # presentation template alone do not trigger an edit
        checktemplate = self.templates.parsetemplate(r'<!--ROW-->%%link%%')
        checktext = self.templates.processindex(checktemplate, idx)
        # mask to an unsigned 32-bit value ('L' suffix dropped: it is a
        # syntax error on Python 3 and redundant on Python 2)
        checksum = zlib.adler32(checktext.encode('utf8')) & 0xffffffff

        t = self.templates.get(thisindex.readoption('template'))
        newtext = self.templates.processindex(t, idx)

        if thisindex.changedchecksum(checksum):
            replacement = ('{{' + self.hometemplate + '|' + thisindex.getoptionstring() + '}}'
                           + newtext + '<!--' + self.hometemplate + '-->')
            # splice by match offsets rather than re.sub(); the original code
            # called .sub() on the undefined name 'fulltext' (a NameError),
            # and slicing also keeps backslashes in the generated text literal
            text = text[:full.start(1)] + replacement + text[full.end(1):]
        else:
            wikipedia.output('Not changed')

        ###############################
        # save if something was changed

        if text != page.get():
            # show what was changed
            wikipedia.showDiff(page.get(), text)

            if not self.debug:
                if self.acceptall:
                    choice = 'y'
                else:
                    choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')

                if choice == 'a':
                    choice = 'y'
                    self.acceptall = True

                if choice == 'y':
                    self.changecount += 1
                    try:
                        # Save the page
                        page.put(text)
                    except wikipedia.LockedPage:
                        wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
                        self.errorcount += 1
                    except wikipedia.EditConflict:
                        wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
                        self.errorcount += 1
                    except wikipedia.SpamfilterError as error:
                        wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
                        self.errorcount += 1

def main():
    """Parse the command line and run the archive-index bot."""
    # When debug is True no real changes are made; the bot only shows
    # what would have been changed.
    debug = False

    hometemplate = None
    logbook = None
    defaulttemplate = None
    singlepage = None

    # Parse command line arguments (framework-global args are consumed
    # by wikipedia.handleArgs, which yields the remaining script args)
    for arg in wikipedia.handleArgs():
        if arg.startswith("-debug"):
            debug = True
        elif arg.startswith('-page:'):
            singlepage = arg[len('-page:'):]
        elif arg.startswith('-logbook:'):
            logbook = arg[len('-logbook:'):]
        elif arg.startswith('-defaulttemplate:'):
            defaulttemplate = arg[len('-defaulttemplate:'):]
        elif arg.startswith('-hometemplate:'):
            hometemplate = arg[len('-hometemplate:'):]

    # the home template is the only mandatory parameter
    if hometemplate is None:
        wikipedia.output('hometemplate is required')
        return

    ArchiveBot(debug, hometemplate, defaulttemplate, logbook, singlepage).run()

if __name__ == "__main__":
    try:
        main()
    finally:
        # framework cleanup runs even when main() raises or is interrupted
        wikipedia.stopme()