# Source page: Gebruiker:Edoderoobot/check-disamb-template-nl.py
# (script copied from Wikipedia, the free encyclopedia — header commented out
# so the file is valid Python)
import pywikibot
from pywikibot import pagegenerators as pg
import codecs #used in logfiles, unicoded strings
import sys, re
import datetime
import time
import urllib.request
from urllib.parse import quote
from collections import defaultdict

# --- Configuration ---------------------------------------------------------
# On-wiki page listing backlinks that may be ignored (maintained by hand).
wheretoskip='Wikipedia:Links naar doorverwijspagina\'s/skips'
# Externally generated report of links pointing at disambiguation pages.
# (Superseded URL, kept for reference:
#  https://tools.wmflabs.org/multichill/queries2/nlwp/links_naar_doorverwijspaginas.txt)
sourcefromfile='https://multichill.toolforge.org/queries2/nlwp/links_naar_doorverwijspaginas.txt'
# Per-language target page where the generated table is stored.
wikiurl={'nl':u'Wikipedia:Links_naar_doorverwijspagina%27s/data'}
# Base URL used to build a "what links here" link for each table row.
linkstostr=u'https://nl.wikipedia.org/w/index.php?title=Speciaal%3AVerwijzingenNaarHier&namespace=0&target='
template = 'template:Dp'
disamb_addition=' (doorverwijspagina)'  # suffix marking explicit dab titles
allowed_namespaces = [0]                # article namespace only
treshold=1          # [sic] pages with fewer backlinks than this are skipped
max_new_pages=100   # number of newly created pages to scan per run

def getSkiplinks(site):
    """Read the on-wiki skip list and return it as a mapping.

    Each bullet line on the skip page is expected to contain wiki links; the
    first link is a disambiguation-page title, the rest are backlink titles
    that should not be counted for it.  Returns a defaultdict(list) mapping
    dab title -> list of titles to skip (empty if the page does not exist).
    """
    skiplinks = defaultdict(list)
    try:
        text = pywikibot.Page(site, wheretoskip).get()
        # raw strings: avoid invalid-escape warnings on modern Python
        link_re = re.compile(r"\[([^\[\|\]]*)[\]\|]")
        for line in re.findall(r"\*.*", text):
            titles = link_re.findall(line)
            if len(titles) > 1:
                skiplinks[titles[0]] += titles[1:]
    # NOTE(review): recent pywikibot releases expose this as
    # pywikibot.exceptions.NoPage — confirm against the installed version.
    except pywikibot.NoPage:
        pass
    return skiplinks

def getlinksfromfile(filename):
 with urllib.request.urlopen(filename) as response:
   html = response.read().decode("utf-8")
   result=prevx=''
   collect=False
   for i in range(len(html)):
        x=html[i:i+1]
        if (x=='[') and (prevx=='['):
          collect=True
          result=''
        if (collect):
          result=result+x  
        
        if (x==']') and (prevx==']'):
            yield(result[1:len(result)-2])
            result=''
            collect=False
        prevx=x

def getnewpages(site):
    """Yield titles of article-namespace pages created within the last 3 hours.

    The new-pages generator returns newest pages first, so iteration stops at
    the first page that is too old.
    """
    max_age = datetime.timedelta(hours=3)  # script is scheduled every 3 hours
    for page in pg.NewpagesPageGenerator(site, 0, max_new_pages):
        first_rev = page.oldest_revision
        # NOTE(review): .today() is local time while wiki timestamps are UTC;
        # the age is only correct on a UTC-configured host — confirm.
        age = first_rev.timestamp.today() - first_rev.timestamp
        if age >= max_age:
            break  # every following page is older still
        if page.namespace().id in allowed_namespaces and page.exists():
            yield page.title()

def count_links(dppage):
    """Count article-namespace backlinks of disambiguation page *dppage*
    that still need fixing.

    Excluded: redirects, other disambiguation pages, the companion article
    "X (doorverwijspagina)", and titles whitelisted in the global skip list.
    Returns 0 immediately for titles already carrying the dab suffix.
    """
    title = dppage.title()
    if disamb_addition in title:
        return 0
    # .get() instead of [] so a lookup does not grow the skiplinks defaultdict
    skips = skiplinks.get(title, [])
    linksfound = 0
    for backlink in dppage.backlinks():
        if backlink.namespace().id not in allowed_namespaces:
            continue
        if backlink.isRedirectPage() or backlink.isDisambig():
            continue
        if title == backlink.title() + disamb_addition:
            continue
        if backlink.title() in skips:
            continue
        linksfound += 1
    return linksfound

def process_one_disambiguation_page(site,pagetitle,result):
    """Count offending backlinks of the dab page *pagetitle* and, if the
    count reaches the threshold, record it in *result* (title -> count).

    Titles already ending in the dab suffix and titles already present in
    *result* are left untouched.
    """
    if disamb_addition in pagetitle:
        return
    dp_page = pywikibot.Page(site, pagetitle)  # title may use spaces or underscores
    linksfound = count_links(dp_page)
    if linksfound >= treshold:
        # keep the first recorded count; do not overwrite existing entries
        result.setdefault(dp_page.title(), linksfound)

def process_one_regular_page(site,pagetitle,result):
    """Examine every page linked from article *pagetitle* and feed those that
    are disambiguation pages to process_one_disambiguation_page()."""
    page = pywikibot.Page(site, pagetitle)
    for linked in page.linkedPages():
        if linked.isDisambig():
            process_one_disambiguation_page(site, linked.title(), result)
def get_one_line(page):
    """Yield page.text line by line, each line including its trailing '\\n'.

    Bug fix: the original dropped the final line when the text did not end
    with a newline; the remainder is now flushed after the loop.
    """
    one_line = ''
    for ch in page.text:
        one_line = one_line + ch
        if ch == '\n':
            yield one_line
            one_line = ''
    if one_line:  # text did not end with '\n' — emit the last partial line
        yield one_line
 
def count_wiki_links(line):
    """Return the number of non-overlapping '[[' occurrences in *line*,
    capped at 10.

    Bug fix: the original searched ``line[start+1:]`` but then reused the
    returned slice-relative index against the full string, so positions were
    wrong and counts could oscillate up to the cap.  ``str.find`` with an
    explicit start offset keeps all indices absolute.
    """
    counted = 0
    start = line.find('[[')
    while start >= 0 and counted < 10:
        counted += 1
        start = line.find('[[', start + 2)
    return counted

def check_one_page(page,show):
    """Return True when any single line of *page* holds more than one wiki
    link; optionally print the first such line when *show* is truthy."""
    offenders = (ln for ln in get_one_line(page) if count_wiki_links(ln) > 1)
    first_hit = next(offenders, None)
    if first_hit is None:
        return False
    if show:
        print(first_hit)
    return True
            
# ---------------------------------------------------------------------------
# Main script: collect disambiguation pages with problematic incoming links
# and publish the result as a sortable wikitable on the nl-wiki data page.
# ---------------------------------------------------------------------------
# NOTE(review): strftime receives the ctime() string as its *format*; it
# contains no % directives, so this is effectively just time.ctime().
starttime=time.strftime(time.ctime())
print('Start: %s' % starttime)
site=pywikibot.Site('nl')
skiplinks=getSkiplinks(site)  # module-level global, read by count_links()
#print(skiplinks); print(0/0)
result={}  # maps disambiguation-page title -> number of offending backlinks
# Fixed Dutch page header followed by the opening of a sortable wikitable.
wikistr = u'{{verwijzing2|WP:LND/D}}\n'
wikistr += u'Deze pagina wordt met regelmaat door een bot opnieuw gemaakt.\n'
wikistr += u'Zie de geschiedenis van de pagina wanneer, en door welke bot.\n'
wikistr += u'Als hier links zijn meegeteld die niet gerepareerd hoeven te worden, voeg die dan toe op [[%s]].\n' %(wheretoskip)
wikistr += u'{| class="wikitable sortable"\n|-\n! Artikel !! XtraLinks !! Aantal !! Links \n'


# Gather candidates from three sources: the externally produced report,
# recently created pages, and the links already on the data page itself.
for link in getlinksfromfile(sourcefromfile):
  process_one_disambiguation_page(site,link,result)
for link in getnewpages(site):
  process_one_regular_page(site,link,result)
process_one_regular_page(site,wikiurl['nl'],result)  #the actual data page, refreshed each run

# One table row per dab page; 'X' flags pages containing a line with more
# than one wiki link.  The last column links to Special:WhatLinksHere.
for item in result:
    if check_one_page(pywikibot.Page(site,item),False):
        xstr='X'
    else:
        xstr=''
    wikistr+='|-\n|[[%s]]||%s||%s||[%s%s link]\n' % (item,xstr,result[item],linkstostr,quote(item))
wikistr += '|}'
stoptime=time.strftime(time.ctime())
wikistr += '\n\n%s-%s' % (starttime,stoptime)

# Publish the generated table and report completion ("Klaar" = "Done").
pywikibot.Page(site,wikiurl['nl']).put(wikistr,summary='#dp-update')
print('Klaar')