import pywikibot
from pywikibot import pagegenerators as pg
import codecs #used in logfiles, unicoded strings
import sys, re
import datetime
import time
import urllib.request
from urllib.parse import quote
from collections import defaultdict
# Wiki page listing backlinks that must be ignored when counting (maintained by hand).
wheretoskip = 'Wikipedia:Links naar doorverwijspagina\'s/skips'
# Externally generated report of links to disambiguation pages.
# (Old tools.wmflabs.org URL was superseded by the toolforge.org address;
# the original code assigned both, with the first assignment dead.)
# https://tools.wmflabs.org/multichill/queries2/nlwp/links_naar_doorverwijspaginas.txt
sourcefromfile = 'https://multichill.toolforge.org/queries2/nlwp/links_naar_doorverwijspaginas.txt'
# Per-language title of the report page this bot overwrites.
wikiurl = {'nl': u'Wikipedia:Links_naar_doorverwijspagina%27s/data'}
# Base URL for the "what links here" special page; the target title is appended.
linkstostr = u'https://nl.wikipedia.org/w/index.php?title=Speciaal%3AVerwijzingenNaarHier&namespace=0&target='
template = 'template:Dp'
# Suffix that marks an explicitly named disambiguation page title.
disamb_addition = ' (doorverwijspagina)'
allowed_namespaces = [0]  # main (article) namespace only
# NOTE: name is a typo for "threshold" but is referenced elsewhere; kept for compatibility.
treshold = 1  # fewer than this number of backlinks? the item will be skipped
max_new_pages = 100  # how many recently created pages to inspect per run
def getSkiplinks(site):
    """Read the manual skip list and return it as a mapping.

    The skip page consists of bullet lines; the first bracketed title on a
    line is a disambiguation page, the remaining bracketed titles are pages
    whose links to it should not be counted.

    Returns a defaultdict(list): disambiguation title -> list of titles to skip.
    Returns an empty mapping when the skip page does not exist.
    """
    skiplinks = defaultdict(list)
    try:
        skiplinkspage = pywikibot.Page(site, wheretoskip).get()
        # One entry per bullet line.  Raw strings: "\*" / "\[" are invalid
        # escapes in plain string literals on modern Python.
        lines = re.findall(r"\*.*", skiplinkspage)
        # First bracketed chunk up to '|' or ']' of every wiki link on the line.
        linkre = re.compile(r"\[([^\[\|\]]*)[\]\|]")
        for line in lines:
            titles = linkre.findall(line)
            if len(titles) > 1:
                skiplinks[titles[0]] += titles[1:]
    except pywikibot.NoPage:
        # No skip page yet: nothing to skip.
        pass
    return skiplinks
def _iter_double_bracketed(text):
    """Yield the contents of each ``[[...]]`` span found in *text*.

    Character-by-character scan matching the original parser exactly:
    collection (re)starts on every '[[' pair, so with '[[[' the scan restarts
    at the later pair, and ends on the first ']]' pair.
    """
    result = prevx = ''
    collect = False
    for x in text:
        if x == '[' and prevx == '[':
            collect = True
            result = ''
        if collect:
            result = result + x
        if x == ']' and prevx == ']':
            # result now holds '[title]]'; strip the leading '[' and ']]'.
            yield result[1:len(result) - 2]
            result = ''
            collect = False
        prevx = x


def getlinksfromfile(filename):
    """Fetch *filename* (a URL) and yield every ``[[...]]`` link target in it."""
    with urllib.request.urlopen(filename) as response:
        html = response.read().decode("utf-8")
    yield from _iter_double_bracketed(html)
def getnewpages(site):
    """Yield titles of recently created main-namespace pages.

    Walks at most ``max_new_pages`` entries of the new-pages feed and yields
    pages younger than 3 hours (the script's run interval), so each run only
    sees pages created since the previous run.
    """
    # NOTE(review): positional args are (site, namespace=0, total=max_new_pages)
    # — confirm against the installed pywikibot's NewpagesPageGenerator signature.
    for page in pg.NewpagesPageGenerator(site,0,max_new_pages):
        dt=page.oldest_revision  # first revision == page creation
        # Timestamp.today() returns the current time, so this is the page age.
        # NOTE(review): today() is naive local time — assumes revision timestamps
        # compare correctly against it; verify timezone handling.
        timediff=dt.timestamp.today()-dt.timestamp
        if (timediff<datetime.timedelta(3/24)): #page less 3 hours old (script runs every 3 hrs)
            if (page.namespace().id in allowed_namespaces):
                if page.exists():
                    yield(page.title())
        else:
            # Feed is newest-first (presumably): the first too-old page means
            # all remaining ones are older, so stop early.
            break
def count_links(dppage):
    """Count article-namespace backlinks to disambiguation page *dppage*.

    Redirects, other disambiguation pages, the explicitly named
    '... (doorverwijspagina)' twin, and titles on the skip list are not
    counted.  A page whose own title already carries the disambiguation
    suffix is never counted at all (returns 0).
    """
    dp_title = dppage.title()
    if disamb_addition in dp_title:
        return 0
    total = 0
    for referring in dppage.backlinks():
        if referring.namespace().id not in allowed_namespaces:
            continue
        if referring.isRedirectPage() or referring.isDisambig():
            continue
        # Link coming from the article this page disambiguates: intentional.
        if dp_title == referring.title() + disamb_addition:
            continue
        # Manually whitelisted on the skip page.
        if referring.title() in skiplinks[dp_title]:
            continue
        total += 1
        #print('%s-%s-%s' % (total,dp_title,referring.title()))
    return total
def process_one_disambiguation_page(site, pagetitle, result):
    """Count offending backlinks for one disambiguation page.

    Records the (normalized) title in *result* with its backlink count when
    the count reaches the threshold.  Mutates *result* in place.
    """
    if disamb_addition in pagetitle:
        # Explicitly named '(doorverwijspagina)' titles are never reported.
        return
    # pagetitle can contain spaces or underscores; Page normalizes the title.
    dp_page = pywikibot.Page(site, pagetitle)
    links_found = count_links(dp_page)
    if links_found >= treshold and dp_page.title() not in result:
        result[dp_page.title()] = links_found
def process_one_regular_page(site, pagetitle, result):
    """Scan one ordinary page and process every disambiguation page it links to."""
    source_page = pywikibot.Page(site, pagetitle)
    disambig_targets = (lnk for lnk in source_page.linkedPages() if lnk.isDisambig())
    for target in disambig_targets:
        process_one_disambiguation_page(site, target.title(), result)
"""
"""
def get_one_line(page):
    """Yield the wikitext of *page* line by line.

    Each yielded line keeps its trailing newline.  A final line without a
    trailing newline is yielded too — the original implementation silently
    dropped it, so check_one_page() never inspected the last line of pages
    whose text does not end in '\\n'.
    """
    one_line = ''
    for x in page.text:
        one_line = one_line + x
        if x == '\n':
            yield one_line
            one_line = ''
    if one_line:
        # Bug fix: emit the unterminated final line instead of discarding it.
        yield one_line
def count_wiki_links(line):
    """Return the number of '[[' wiki-link openers in *line*, capped at 10.

    Bug fix: the original did ``start = line[start+1:].find('[[')`` — the
    slice's return value is relative to the slice but was reused as an
    absolute position, so certain inputs (e.g. ``'a[[b[[c'``) re-scanned the
    same region repeatedly and spun up to the cap instead of counting 2.
    Using ``str.find`` with an absolute start offset counts each
    non-overlapping opener exactly once.
    """
    counted = 0
    start = line.find('[[')
    while start >= 0 and counted < 10:
        counted += 1
        # Resume just past this opener; offset stays absolute.
        start = line.find('[[', start + 2)
    return counted
def check_one_page(page, show):
    """Return True if any line of *page* contains more than one wiki link.

    Stops at the first such line; when *show* is true, that line is printed.
    """
    for candidate in get_one_line(page):
        #if (show): print(candidate)
        if count_wiki_links(candidate) <= 1:
            continue
        if show:
            print(candidate)
        return True
    return False
# ---- main script: build and publish the report page ----
starttime=time.strftime(time.ctime())
print('Start: %s' % starttime)
site=pywikibot.Site('nl')
# Global: read by count_links() to filter whitelisted backlinks.
skiplinks=getSkiplinks(site)
#print(skiplinks); print(0/0)
# Disambiguation-page title -> number of offending backlinks.
result={}
# Report header and table head (Dutch: this text is written to the wiki page).
wikistr = u'{{verwijzing2|WP:LND/D}}\n'
wikistr += u'Deze pagina wordt met regelmaat door een bot opnieuw gemaakt.\n'
wikistr += u'Zie de geschiedenis van de pagina wanneer, en door welke bot.\n'
wikistr += u'Als hier links zijn meegeteld die niet gerepareerd hoeven te worden, voeg die dan toe op [[%s]].\n' %(wheretoskip)
wikistr += u'{| class="wikitable sortable"\n|-\n! Artikel !! XtraLinks !! Aantal !! Links \n'
# 1) Disambiguation pages from the externally generated report file.
for link in getlinksfromfile(sourcefromfile):
    process_one_disambiguation_page(site,link,result)
# 2) Recently created pages, which may link to disambiguation pages
#    not yet present in the report file.
for link in getnewpages(site):
    process_one_regular_page(site,link,result)
# 3) The current report page itself, to refresh entries already listed.
process_one_regular_page(site,wikiurl['nl'],result) #the actual page, refresh
# One table row per flagged disambiguation page.
for item in result:
    # 'X' marks pages with more than one wiki link on a single line.
    if check_one_page(pywikibot.Page(site,item),False):
        xstr='X'
    else:
        xstr=''
    wikistr+='|-\n|[[%s]]||%s||%s||[%s%s link]\n' % (item,xstr,result[item],linkstostr,quote(item))
wikistr += '|}'
stoptime=time.strftime(time.ctime())
wikistr += '\n\n%s-%s' % (starttime,stoptime)
# Overwrite the report page with the freshly generated table.
pywikibot.Page(site,wikiurl['nl']).put(wikistr,summary='#dp-update')
print('Klaar')