# Gebruiker:Rozebotje/Archief Index/code  (wiki page title; artifact of the page scrape)
# "Uiterlijk" is a MediaWiki interface label that was captured along with the page text.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Generates indexes of archived talk pages.
The following parameters are supported:
-debug If given, doesn't do any real changes, but only shows
what would have been changed.
-log Writes output to logfile
-page:pagename
Create an index only on this page.
Otherwise all pages which transclude
the hometemplate will be processed.
-logbook:pagename
Write a log to this page
-defaulttemplate:pagename
Default template to use
-hometemplate:pagename
page which is transcluded to generate the index.
*** This is required! ***
"""
__version__ = '$Id$'
import wikipedia
import pagegenerators
import re
import sys
import zlib
from time import strftime, localtime
from operator import itemgetter
# This is required for the text that is shown when you run this script
# with the parameter -help.  (Empty: the module docstring needs no
# parameter substitutions.)
docuReplacements = {
}
# contains handy static functions
class TextFunctions:
    '''Collection of static text-manipulation helpers.'''

    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/145672/index_txt
    @staticmethod
    def formatBlock(block):
        '''Trim leading/trailing empty lines and the leading whitespace
        that is common to all lines (taken from the first line) from the
        given block of text.

        The purpose is to let us write a code block as a multiline,
        triple-quoted Python string without the indentation ending up
        in the output.'''
        rows = str(block).split('\n')

        def drop_blank_edges(seq):
            # remove empty strings from both ends of the list, in place
            while seq and not seq[0]:
                del seq[0]
            while seq and not seq[-1]:
                del seq[-1]

        drop_blank_edges(rows)
        # the first line determines how much indentation to trim
        indent = re.match(r'\s*', rows[0]).group(0)
        if indent:
            rows = [row.replace(indent, '', 1) for row in rows]
        # pure-whitespace lines may have become empty after the removal,
        # so trim the edges once more
        drop_blank_edges(rows)
        return '\n'.join(rows) + '\n'

    @staticmethod
    def getanchor(sectiontext):
        '''Return the anchor link of a section based on its title.

        On the Dutch wikipedia it is not enough to call
        wikipedia.sectionencode alone.'''
        # strip [[ ]] / '' markup around the section name first
        plain = TextFunctions.removeformatting(sectiontext)
        encoded = wikipedia.sectionencode(plain, wikipedia.getSite().encoding())
        # spaces at the start and end were converted to underscores;
        # strip them off again
        return encoded.strip('_')

    @staticmethod
    def removeformatting(linktext):
        '''Remove [[ ]] and '' from a string; return the text as it
        would be displayed.'''
        for pattern, repl in (
                (r'(?x)\[\[ [^|\]]* \|( [^|\]]* ) \]\]', r'\1'),
                (r'(?x)\[\[ ( [^\]]* ) \]\]', r'\1'),
                (r"'''(.*)'''", r'\1'),
                (r"''(.*)''", r'\1')):
            linktext = re.sub(pattern, repl, linktext)
        return linktext
class Templates:
    '''
    Renders an index as a (wiki) table.

    A template is stored as a python dictionary mapping section names
    (HEADER, ROW, ALT ROW, FOOTER, ...) to text fragments.
    Use get() to load a template from a wikipedia page,
    parsetemplate() to build a template from a string, and
    processindex() to render an index as text using a template.
    '''

    def __init__(self):
        # cache of parsed templates, keyed by page name
        self.templates = {}
        self.default = 'default'
        self.templates[self.default] = self.getdefaulttemplate()

    def loadpage(self, name):
        '''Fetch page *name* and cache its parsed template (best effort).'''
        if name is None:
            return
        page = wikipedia.Page(wikipedia.getSite(), name)
        try:
            text = page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            return
        self.templates[name] = self.parsetemplate(text)

    def get(self, name):
        '''Return the template called *name*, falling back to the default.'''
        if name not in self.templates:
            self.loadpage(name)
        try:
            return self.templates[name]
        except KeyError:
            return self.templates[self.default]

    def getdefaulttemplate(self):
        '''Return the built-in fallback template.'''
        raw = TextFunctions.formatBlock('''
            <!-- HEADER -->
            {| class="sortable"
            ! Onderwerp !! Link
            <!-- ROW -->
            |-
            | %%topic%% || [[%%link%%|%%page%%]]
            <!-- ALT ROW -->
            |- style="background: #dddddd;"
            | %%topic%% || [[%%link%%|%%page%%]]
            <!-- FOOTER -->
            |}
            <!-- END -->
            ''')
        return self.parsetemplate(raw)

    def parsetemplate(self, text):
        '''Split *text* on <!-- NAME --> markers into a section dict.'''
        result = {}
        current = ''
        marker = re.compile(r'<!--([^-]*)-->')
        # the capturing group keeps the markers in the split result
        for chunk in re.split(r'(<!--[^-]*-->)', text):
            found = marker.match(chunk)
            if found is None:
                result[current] = chunk.strip() + '\n'
            else:
                current = found.group(1).strip()
        return result

    def processindex(self, template, index):
        '''Render *index* (a list of row dicts with keys topic/link/page)
        as text using *template*; return '' for an empty index.'''
        if not index:
            return ''
        pieces = []
        for section in ('LEAD', 'HEADER'):
            if section in template:
                pieces.append(template[section])
        for number, row in enumerate(index):
            # every second row uses the alternate style, when defined
            if number % 2 == 1 and 'ALT ROW' in template:
                fragment = template['ALT ROW']
            else:
                fragment = template['ROW']
            fragment = fragment.replace('%%topic%%', row['topic'])
            fragment = fragment.replace('%%link%%', row['link'])
            fragment = fragment.replace('%%page%%', row['page'])
            pieces.append(fragment)
        for section in ('FOOTER', 'TAIL'):
            if section in template:
                pieces.append(template[section])
        rendered = ''.join(pieces)
        # allow %%subst%% and %%now%% to be replaced in all sections, not just ROW
        rendered = rendered.replace('%%subst%%', 'subst:')
        rendered = rendered.replace('%%now%%', strftime("%d %b %Y %H:%M (%Z)"))
        for token, literal in (('%%((%%', '{{'), ('%%))%%', '}}'),
                               ('%%(%%', '{'), ('%%)%%', '}')):
            rendered = rendered.replace(token, literal)
        return rendered
class IndexGenerator:
    '''
    Create an archive index of a number of pages.

    An index is a list of dictionaries with the following keys:
        sortkey : lowercase text useful for sorting
        link    : link to the page section
        page    : title of the (sub)page
        topic   : title of the section

    readoptions() reads an option string, getoptionstring() returns the
    current options, addpage() is an internal helper that processes a
    single page, and retrieve() generates the index using the options
    that were set before.
    '''

    def __init__(self):
        # list of dicts describing which pages to index
        # ({'page': ...} or {'pageprefix': ...} plus name/include/exclude)
        self.pages = []
        # options that apply to the whole index (template, checksum)
        self.globaloptions = {}

    def setoption(self, name, value):
        '''Set a global option; a value of None removes the option.'''
        if value is None:
            if name in self.globaloptions:
                del self.globaloptions[name]
        else:
            self.globaloptions[name] = str(value)

    def readoption(self, name):
        '''Return the value of a global option, or None if unset.'''
        if name in self.globaloptions:
            return self.globaloptions[name]
        return None

    def changedchecksum(self, checksum):
        '''Store *checksum*; return True if it differs from the stored one.'''
        if 'checksum' in self.globaloptions and str(self.globaloptions['checksum']) == str(checksum):
            return False
        self.globaloptions['checksum'] = str(checksum)
        return True

    def readoptions(self, txt, pagename):
        '''Parse a semicolon-separated option string.

        Recognized options: page, pageprefix (start a new page entry);
        name, include, exclude (modify the current page entry);
        checksum, template (global).  Without any page/pageprefix
        option, all subpages of *pagename* are indexed.
        '''
        for option in txt.split(';'):
            # split on the first '=' only, so values may contain '='
            # (e.g. an include regex); the old maxsplit of 2 silently
            # dropped such options
            opt = option.split('=', 1)
            if len(opt) == 2:
                if opt[0] in ('page', 'pageprefix'):
                    self.pages.append({opt[0]: opt[1]})
                elif (opt[0] in ('name', 'include', 'exclude')) and (len(self.pages) != 0):
                    self.pages[-1][opt[0]] = opt[1]
                elif opt[0] in ('checksum', 'template'):
                    self.globaloptions[opt[0]] = opt[1]
                else:
                    wikipedia.output('unknown/invalid option: %s=%s' % (opt[0], opt[1]))
        if len(self.pages) == 0:
            # default: index all subpages of the current page
            self.pages.append({'pageprefix': pagename + '/'})

    def getoptionstring(self):
        '''Serialize the current options back to the option-string format.'''
        ret = ''
        if 'template' in self.globaloptions:
            ret += 'template=%s;' % self.globaloptions['template']
        for indexpage in self.pages:
            if 'page' in indexpage:
                # 'name' defaults to the page title when it was never set
                # (retrieve() fills it in, but don't rely on call order)
                ret += 'page=%s;name=%s;' % (indexpage['page'],
                                             indexpage.get('name', indexpage['page']))
            elif 'pageprefix' in indexpage:
                ret += 'pageprefix=%s;' % indexpage['pageprefix']
                if 'name' in indexpage:
                    ret += 'name=%s;' % indexpage['name']
            if 'include' in indexpage:
                ret += 'include=%s;' % indexpage['include']
            elif 'exclude' in indexpage:
                ret += 'exclude=%s;' % indexpage['exclude']
        if 'checksum' in self.globaloptions:
            ret += 'checksum=%s;' % self.globaloptions['checksum']
        return ret

    def addpage(self, page, shortname):
        '''Collect index entries for every section of a single page.

        Level-2 sections are indexed directly; level-3 sections are
        indexed as "title3 (title2)"; level-4 sections are skipped.
        '''
        pagetitle = page.title()
        try:
            text = page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            wikipedia.output('error get()')
            return []
        # \n is enough...
        text = re.sub('\r', '', text)
        ret = []
        lasttitle2 = ""
        lasttitle3 = ""
        lasttitle4 = ""
        title2 = re.compile(r'^==\ *([^= ].*[^= ])\ *==$')
        title3 = re.compile(r'^===\ *([^= ].*[^= ])\ *===$')
        title4 = re.compile(r'^====\ *([^= ].*[^= ])\ *====$')
        # The capturing group keeps the heading lines in the split result.
        # Without it re.split() discards the separators, the title
        # patterns below never match, and the index stays empty.
        parts = re.split(r'(?m)(^==.*==$)', text)
        for part in parts:
            t2 = title2.match(part)
            t3 = title3.match(part)
            t4 = title4.match(part)
            if t2 is not None:
                lasttitle2 = t2.group(1)
                lasttitle3 = ""
            elif t3 is not None:
                lasttitle3 = t3.group(1)
                lasttitle4 = ""
            elif t4 is not None:
                lasttitle4 = t4.group(1)
            else:
                # a body part: emit an entry for the most recent heading
                if lasttitle4 != "":
                    # level-4 sections are not indexed
                    lasttitle4 = ""
                elif lasttitle3 != "":
                    anchor = TextFunctions.getanchor(lasttitle3)
                    linktext = TextFunctions.removeformatting(lasttitle3 + " (" + lasttitle2 + ")")
                    sortkey = re.sub('[^a-z]', '', linktext.lower())
                    ret.append({'sortkey': sortkey, 'link': pagetitle + "#" + anchor,
                                'page': shortname, 'topic': linktext})
                elif lasttitle2 != "":
                    anchor = TextFunctions.getanchor(lasttitle2)
                    linktext = TextFunctions.removeformatting(lasttitle2)
                    sortkey = re.sub('[^a-z]', '', linktext.lower())
                    ret.append({'sortkey': sortkey, 'link': pagetitle + "#" + anchor,
                                'page': shortname, 'topic': linktext})
        return ret

    def retrieve(self):
        '''Build and return the index, sorted by sortkey, for all
        configured pages.'''
        ret = []
        for indexpage in self.pages:
            if 'page' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = indexpage['page']
                page = wikipedia.Page(wikipedia.getSite(), indexpage['page'])
                ret.extend(self.addpage(page, indexpage['name']))
            elif 'pageprefix' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = ''
                pagelist = []
                subpagegen = pagegenerators.PrefixingPageGenerator(prefix=indexpage['pageprefix'])
                for subpage in subpagegen:
                    if 'include' in indexpage:
                        if re.search(indexpage['include'], subpage.title()) is not None:
                            pagelist.append(subpage.title())
                    elif 'exclude' in indexpage:
                        if re.search(indexpage['exclude'], subpage.title()) is None:
                            pagelist.append(subpage.title())
                    else:
                        pagelist.append(subpage.title())
                if pagelist:
                    gen = iter([wikipedia.Page(wikipedia.getSite(), t) for t in pagelist])
                    gen = pagegenerators.PreloadingGenerator(gen)
                    for page in gen:
                        # the short name is the title with the common prefix
                        # replaced by the configured display name
                        ret.extend(self.addpage(page, indexpage['name'] + page.title()[len(indexpage['pageprefix']):]))
        ret.sort(key=itemgetter('sortkey'))
        return ret
class ArchiveBot:
    '''
    Walks over all pages that transclude the home template (or one
    single page) and writes or refreshes an archive index on each.
    '''
    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'en': u'Robot: Create archive index',
        'nl': u'robot: Creëer archief index',
    }

    def __init__(self, debug, hometemplate, defaulttemplate, logbook, singlepage):
        """
        Constructor. Parameters:
            * debug           - If True, doesn't do any real changes, but only
                                shows what would have been changed.
            * hometemplate    - name of the template that marks pages to index
            * defaulttemplate - page holding the default table template (or None)
            * logbook         - page to write a run log to (or None)
            * singlepage      - if set, only this page is processed
        """
        self.generator = None
        self.debug = debug
        self.hometemplate = hometemplate
        self.defaulttemplate = defaulttemplate
        self.logbook = logbook
        self.singlepage = singlepage
        self.acceptall = False
        self.processed = 0
        self.changecount = 0
        self.errorcount = 0
        self.templates = Templates()
        if self.singlepage is not None:
            self.generator = iter([wikipedia.Page(wikipedia.getSite(), self.singlepage)])
        else:
            # process every page that transcludes the home template
            transclusionPage = wikipedia.Page(wikipedia.getSite(), self.hometemplate)
            self.generator = pagegenerators.ReferringPageGenerator(transclusionPage, onlyTemplateInclusion=True)
            self.generator = pagegenerators.PreloadingGenerator(self.generator)

    def createlog(self):
        '''Append a summary of this run to the logbook page, if configured.'''
        if self.logbook is None:
            return
        log_page = wikipedia.Page(wikipedia.getSite(), self.logbook)
        try:
            log_text = log_page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            log_text = ''
        old_log_text = log_text
        # list comprehension instead of map() so this also works on Python 3,
        # where map() returns an iterator that cannot be added to a list
        args = [wikipedia.decodeArg(sys.argv[0])] + [wikipedia.decodeArg('"%s"' % s) for s in sys.argv[1:]]
        log_text += '\n* Start: %s\n' % self.starttime
        log_text += r'* Command: <nowiki>' + u' '.join(args) + r'</nowiki>' + '\n'
        log_text += '* Processed: %d pages\n' % self.processed
        log_text += '* Changes: %d pages\n' % self.changecount
        log_text += '* Errors: %d pages\n' % self.errorcount
        log_text += '* End: %s\n' % self.endtime
        log_text += '----\n'
        com = wikipedia.translate(wikipedia.getSite(), self.msg) + ' (Log)'
        wikipedia.showDiff(old_log_text, log_text)
        if not self.debug:
            try:
                log_page.put(log_text, comment=com, minorEdit=True)
            except Exception:
                # the log is best-effort only; never abort the run over it
                # (narrowed from a bare except: so Ctrl-C still works)
                wikipedia.output(u'Could not save log')

    def run(self):
        '''Process every page from the generator, then write the log.'''
        self.starttime = strftime("%d %b %Y %H:%M (%Z)")
        # Set the edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
        for page in self.generator:
            self.treat(page)
        self.endtime = strftime("%d %b %Y %H:%M (%Z)")
        self.createlog()

    def treat(self, page):
        """
        Load the given page, regenerate its archive index, and save it
        when the index content changed.
        """
        self.processed += 1
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\03{lightpurple}%s\03{default}:" % page.title())
        try:
            # Load the page
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return
        if not page.botMayEdit(wikipedia.getSite().loggedInAs()):
            wikipedia.output(u"Page %s is locked for robot editing; skipping." % page.aslink())
            return
        thisindex = IndexGenerator()
        # escape the template name so any regex metacharacters in it match literally
        tmplname = re.escape(self.hometemplate)
        # 'full' spans the template call plus any previously generated index,
        # up to and including the closing <!--hometemplate--> marker
        full = re.search(r'(\{\{' + tmplname + r'(\|[^}]*)?\}\}(.*' + tmplname + r'-->)?)', text, re.DOTALL)
        tmplopt = re.search(r'\{\{' + tmplname + r'\|([^}]*)\}\}', text)
        if tmplopt is not None:
            thisindex.readoptions(tmplopt.group(1), page.title())
        else:
            wikipedia.output('cannot read options, using default')
            thisindex.readoptions('', page.title())
        if thisindex.readoption('template') is None:
            thisindex.setoption('template', self.defaulttemplate)
        idx = thisindex.retrieve()
        # checksum over the links only, so cosmetic template changes do not
        # count as content changes
        checktemplate = self.templates.parsetemplate(r'<!--ROW-->%%link%%')
        checktext = self.templates.processindex(checktemplate, idx)
        checksum = zlib.adler32(checktext.encode('utf8')) & 0xffffffff
        t = self.templates.get(thisindex.readoption('template'))
        newtext = self.templates.processindex(t, idx)
        if thisindex.changedchecksum(checksum):
            if full is None:
                # the options regex may match while the full-span regex does not
                # only in pathological cases; bail out rather than crash
                wikipedia.output(u'Could not locate the %s template on the page; skipping.' % self.hometemplate)
                return
            replacement = ('{{' + self.hometemplate + '|' + thisindex.getoptionstring() + '}}'
                           + newtext + '<!--' + self.hometemplate + '-->')
            # Fix: the original called .sub() on the undefined name 'fulltext'
            # (NameError).  str.replace is used instead of re.sub because the
            # generated index text may contain backslashes that re.sub would
            # interpret as escapes in the replacement string.
            text = text.replace(full.group(1), replacement, 1)
        else:
            wikipedia.output('Not changed')
        ###############################
        # save if something was changed
        if text != page.get():
            # show what was changed
            wikipedia.showDiff(page.get(), text)
            if not self.debug:
                if self.acceptall:
                    choice = 'y'
                else:
                    choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                if choice == 'a':
                    choice = 'y'
                    self.acceptall = True
                if choice == 'y':
                    self.changecount += 1
                    try:
                        # Save the page
                        page.put(text)
                    except wikipedia.LockedPage:
                        wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
                        self.errorcount += 1
                    except wikipedia.EditConflict:
                        wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
                        self.errorcount += 1
                    except wikipedia.SpamfilterError as error:
                        wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
                        self.errorcount += 1
def main():
    """Parse the command line and start the archive-index bot."""
    # When debug is True the bot makes no real changes; it only shows
    # what would have been changed.
    debug = False
    hometemplate = None
    logbook = None
    defaulttemplate = None
    singlepage = None
    # Parse command line arguments
    for arg in wikipedia.handleArgs():
        if arg.startswith("-debug"):
            debug = True
        elif arg.startswith('-page:'):
            singlepage = arg[len('-page:'):]
        elif arg.startswith('-logbook:'):
            logbook = arg[len('-logbook:'):]
        elif arg.startswith('-defaulttemplate:'):
            defaulttemplate = arg[len('-defaulttemplate:'):]
        elif arg.startswith('-hometemplate:'):
            hometemplate = arg[len('-hometemplate:'):]
    if hometemplate is None:
        wikipedia.output('hometemplate is required')
        return
    bot = ArchiveBot(debug, hometemplate, defaulttemplate, logbook, singlepage)
    bot.run()
if __name__ == "__main__":
    try:
        main()
    finally:
        # always tear down the wikipedia framework (throttle bookkeeping),
        # even when main() raised or was interrupted
        wikipedia.stopme()