# Source: Dutch Wikipedia user page "Gebruiker:Edoderoobot/ongekoppelde-paginas.py"
# ("Uit Wikipedia, de vrije encyclopedie" — from Wikipedia, the free encyclopedia).
# Bot script that reports nl-wiki pages not yet connected to a Wikidata item.
import json
import sys
import urllib.request, urllib.parse
import datetime
from time import strftime

import pywikibot
#from pywikibot import pagegenerators
#from pywikibot import pagegenerators as pg, textlib, WikidataBot
from pywikibot import textlib

# Dutch month names, 1-based: maanden[dt.month] yields the month name.
# Index 0 is a filler ('nl') so that January sits at index 1.
maanden=['nl','januari','februari','maart','april','mei','juni','juli','augustus','september','oktober','november','december']

# Debug toggle: the second assignment wins, so the bot runs in normal
# (non-debug) mode; re-enable debugging by commenting out the override.
debugmodus = True
debugmodus = False

# Per-class wikitext buckets for the report rows (keyed by page class or,
# dynamically, by infobox name); filled by action(), concatenated in main().
wikistr={'taxon':'','person':'','dp':'','list':'','misc':''}
# Target report page per language.
wikiurl={'nl':u'Wikipedia:Wikidata/Ongekoppelde pagina\'s'}

# Template names (lowercased, 'Sjabloon:' prefix stripped) that must NOT be
# treated as a real infobox.
skip_templates = ['','infobox/breedte','!!','0','afbeelding gewenst','infobox generiek','nl','nl-vlag','be','be-vlag','afbeeldingbreedte']
# Infobox names that identify a page as being about a person.
person_templates = ['acteur','artiest','artiest klassieke muziek','atleet','atletiek','auteur','filmregisseur','hoogleraar','medicus','journalist','kunstenaar','persoon','politicus nederland','presentator','sporter','voetballer','wielrenner']
# Templates that mark a disambiguation page.
disamb_templates = ['dpintro','dp']
# Deletion/cleanup nomination templates: pages carrying one are skipped.
nomination_templates = ['auteur','ne','nuweg','reclame','wb','wiu','samenvoegen naar','weg','samenvoegen']
# Marker template that person infoboxes transclude.
mustbe='Navigatie infoboxen personen'

# Wikidata item for 'Wikimedia disambiguation page' (value of P31).
isdisamb='Q4167410'

def is_person_template(checktemplate):
  """Return True when the given template (namespace 10) transcludes one of
  the known person-infobox marker templates on nl-wiki."""
  person_markers = ('Sjabloon:Navigatie infoboxen personen',
                    'Sjabloon:Afbeelding gewenst persoon')
  template_page = pywikibot.Page(pywikibot.Site(), checktemplate, 10)
  return any(t.title() in person_markers for t in template_page.templates())

def try2find_taxon(repo,searchstr):
  """Read the page's Taxobox, take its 'worms' parameter, and try to match
  it against the P850 (WoRMS-ID) claim of Wikidata search results.

  Returns a tuple (candidates_checked, item_title_or_None).
  """
  wiki = pywikibot.Site()
  article = pywikibot.Page(wiki, searchstr)

  # Scan the Taxobox template parameters for the WoRMS identifier.
  worms_id = ''
  for name, params in textlib.extract_templates_and_params(article.get()):
    if name == 'Taxobox':
      for key, val in params.items():
        if key == 'worms':
          worms_id = val

  limit = 99
  checked = 0
  for hit in repo.search(searchstr, [0]):
    checked += 1
    if checked > limit:
      return checked, None
    candidate = pywikibot.ItemPage(repo, hit.title())
    candidate.get(get_redirect=True)
    if 'P850' in candidate.claims:
      # only the first P850 claim is compared, as in the original
      if candidate.claims.get('P850')[0].getTarget() == worms_id:
        return checked, candidate.title()

  return checked, None
  
def try2find_person(repo,searchstr):
  """Try to match the (person) page to a Wikidata item by IMDb id (P345)
  or by exact date of birth (P569, compared as a Dutch-formatted string).

  Returns a tuple (candidates_checked, item_title_or_None).
  """
  savegebdate = saveimdb = None

  site = pywikibot.Site()
  page = pywikibot.Page(site, searchstr)
  pagetext = page.get()
  # Pull birth date and IMDb id from any infobox parameters on the page.
  for templ, fielddict in textlib.extract_templates_and_params(pagetext):
    for field, value in fielddict.items():
      if field == 'geboortedatum':
        savegebdate = value
      if field == 'imdb':
        saveimdb = value

  c = 0
  for oneperson in repo.search(searchstr, [0]):
    c += 1
    wditem = pywikibot.ItemPage(repo, oneperson.title())
    wditem.get(get_redirect=True)
    if (saveimdb is not None) and ('P345' in wditem.claims):  # IMDb id
      for thisclaim in wditem.claims.get('P345'):
        # BUG FIX: the original did `thisimdb.rfind(saveimdb)` as a truth
        # test — rfind() returns -1 when NOT found (truthy) and 0 for a
        # match at the start (falsy), so the check was effectively
        # inverted. It also called str.title() on the id, mangling the
        # case of ids like 'tt1234567'. Use substring membership on the
        # raw id instead. (P345 targets are plain strings — TODO confirm
        # against the pywikibot version in use.)
        thisimdb = str(thisclaim.getTarget())
        if saveimdb in thisimdb:
          return c, wditem.title()
    if (savegebdate is not None) and ('P569' in wditem.claims):  # birth date
      try:
        thisdate = wditem.claims.get('P569')[0].getTarget()  # WbTime of candidate
        xstr = str(savegebdate).replace('[','').replace(']','')  # strip [[ ]] link brackets
        ystr = '%d %s %d' % (thisdate.day, maanden[thisdate.month], thisdate.year)  # Dutch month names
        if xstr == ystr:
          return c, wditem.title()
      except (AttributeError, IndexError, TypeError):
        # partial/unknown dates may lack day/month/year; skip candidate
        pass
  return c, None

def try2find_dp(repo,searchstr):
    """Search Wikidata for a disambiguation item (P31 = Q4167410) that
    matches searchstr and already has sitelinks.

    Returns a tuple (candidates_checked, item_title_or_None).
    """
    limit = 99   # renamed from `max`, which shadowed the builtin
    c = 0
    for oneitem in repo.search(searchstr, [0]):
      c += 1
      if c > limit:
        return c, None
      wdpage = pywikibot.ItemPage(repo, oneitem.title())
      wdpage.get()
      if 'P31' in wdpage.claims:
        # only the first P31 claim is inspected, as before
        claim31 = wdpage.claims.get('P31')[0].getTarget().title()
        if claim31 == isdisamb and wdpage.sitelinks:
          return c, wdpage.title()
    # BUG FIX: the original returned (0, None) here, discarding the count;
    # the sibling try2find_* helpers all return the number of candidates
    # actually checked.
    return c, None
       
def action(pagename):
  """Classify one unconnected nl-wiki page, try to suggest a matching
  Wikidata item, and append a wikitable row to the global wikistr buckets.

  Pages carrying a deletion/cleanup nomination template are skipped.
  """
  isPerson = False
  isList = False          # never set True in this version; kept for the bucket logic
  isDisambigue = False
  hasInfobox = None       # infobox name for the report, or None
  gotInfobox = False      # True when a real (non-skipped) Infobox was seen
  hasCategory = None      # first non-'Wikipedia' category, or None
  isTaxon = False
  global wikistr
  suggest_wd = None       # suggested Wikidata item title, or None
  level = 0               # number of search candidates examined

  site = pywikibot.Site('nl')
  repo = site.data_repository()
  page = pywikibot.Page(site, pagename)
  for ptemplate in page.templates():
    thistemplate = ptemplate.title()[9:]   # strip the 'Sjabloon:' prefix
    if thistemplate.lower() in nomination_templates:
      return   # nominated for deletion: do not report this page
    if hasInfobox is None:
      if thistemplate == 'Taxobox':
        hasInfobox = 'Taxobox'
        isTaxon = True
      if thistemplate[0:7] == 'Infobox' and thistemplate.lower() not in skip_templates:
        gotInfobox = True
        hasInfobox = ptemplate.title()[17:]   # strip 'Sjabloon:Infobox '

    isPerson = isPerson or is_person_template(thistemplate)
    # BUG FIX: the original assigned `isDisambigue = ... in disamb_templates`
    # anew on every iteration, so only the page's *last* template counted.
    # Accumulate with `or`, the same way isPerson does above.
    isDisambigue = isDisambigue or (thistemplate.lower() in disamb_templates)

  for pcategory in page.categories():
    thiscat = pcategory.title()[10:]   # strip the 'Categorie:' prefix
    if hasCategory is None and thiscat[0:9] != 'Wikipedia':
      hasCategory = thiscat

  if isTaxon:
    level, suggest_wd = try2find_taxon(repo, pagename)
  if isDisambigue:
    level, suggest_wd = try2find_dp(repo, pagename)
  if isPerson:
    level, suggest_wd = try2find_person(repo, pagename)

  if suggest_wd is not None:
    suggest_wd = "[[:d:%s]]" % suggest_wd

  if hasCategory is None: hasCategory = 'None'
  if hasInfobox is None: hasInfobox = 'None'
  onestr = "\n|-\n|[[%s]]\n|%s\n|%s\n|%s\n|%s\n|%s\n|%i\n" % (pagename.replace('_',' '),isDisambigue,isPerson,hasInfobox.replace('_',' '),hasCategory.replace('_',' '), suggest_wd, level )
  if   isTaxon:      wikistr['taxon'] += onestr
  elif isPerson:     wikistr['person'] += onestr
  elif isDisambigue: wikistr['dp'] += onestr
  elif isList:       wikistr['list'] += onestr
  else:
    if gotInfobox:
      # dynamic bucket per infobox name; create it on first use
      if hasInfobox in wikistr:
        wikistr[hasInfobox] += onestr
      else:
        wikistr[hasInfobox] = onestr
    else:
      wikistr['misc'] += onestr
    
def main():
  """Fetch all nl-wiki pages without a Wikidata item via PetScan, classify
  each with action(), and assemble the wikitext report.

  With debugmodus True the report is printed to stdout; the actual page
  save is currently disabled (the put() call is commented out).
  """
  #exit()  #avoid scheduled runs from crontab
  dparray = []
  global wikistr

  mylanguage = u'nl'
  myproject = u'wikipedia'
  mydepth = u'12'
  # Earlier runs used u'Wikipedia:Doorverwijspagina' / u'Nederlands persoon';
  # the effective value is 'Alles' (everything).
  mycategory = u'Alles'

  # PetScan query: all mainspace pages in the category tree that have no
  # Wikidata item (wikidata_item=without), no redirects, JSON output.
  query = 'https://petscan.wmflabs.org/?'\
         'language='+mylanguage+\
         '&project='+myproject+\
         '&depth='+mydepth+\
         '&categories='+urllib.parse.quote_plus(mycategory)+\
         '&combination=subset'\
         '&negcats='\
         '&ns%5B0%5D=1'\
         '&larger='\
         '&smaller='\
         '&minlinks='\
         '&maxlinks='\
         '&before='\
         '&after='\
         '&max_age='\
         '&show_redirects=no'\
         '&edits%5Bbots%5D=both'\
         '&edits%5Banons%5D=both'\
         '&edits%5Bflagged%5D=both'\
         '&templates_yes='\
         '&templates_any='\
         '&templates_no='\
         '&outlinks_yes='\
         '&outlinks_any='\
         '&outlinks_no='\
         '&sparql='\
         '&manual_list='\
         '&manual_list_wiki='\
         '&pagepile='\
         '&common_wiki=cats'\
         '&format=json'\
         '&output_compatability=catscan'\
         '&sortby=none'\
         '&sortorder=ascending'\
         '&wikidata_item=without'\
         '&wikidata_label_language='\
         '&regexp_filter='\
         '&doit='\
         '&interface_language=en'\
         '&active_tab=tab_output'\
         '&format=json'

  if False:  #or os.isatty(sys.stdin.fileno()): detect run from cron or console
    action('Fred McLeod')
  else:
    print("get query")
    response = urllib.request.urlopen(query)
    print("process pages")
    raw_data = response.read()   # renamed from `rawdate` (typo)
    results = json.loads(raw_data.decode('utf8'))

    # PetScan catscan-compatible JSON: pages live under ['*'][0]['a']['*']
    for dp in results['*'][0]['a']['*']:
      dparray.append(dp['title'])

    print("-------")
    for i, title in enumerate(dparray):
      print("%i - %s" % (i, title))
      action(title)

  # BUG FIX: the original formatted '{:%d-%m-%Y %H:%m}' on a *date* object:
  # the trailing %m is the month (minutes are %M) and a date carries no
  # time of day. Use datetime.now() with %H:%M.
  timestamp = '{:%d-%m-%Y %H:%M}'.format(datetime.datetime.now())
  # BUG FIX: the table header listed 6 columns while each row emits 7 cells;
  # added the missing 'Niveau' (level) column header.
  wikiString = (u'Dit zijn [[Speciaal:OngekoppeldePaginas|pagina\'s die niet gekoppeld zijn aan items]] minus de pagina\'s die genomineerd zijn voor verwijdering.\n\n'+\
               u'Deze pagina wordt automatisch gegenereerd, handmatige updates hebben dus geen zin!\n\n'+\
               u'aangemaakt op %s\n\n' +\
               '{| class=\"wikitable sortable\"\n|-\n!Pagina || Dp || Persoon || Infobox || Categorie || Suggestie || Niveau \n') % timestamp

  print("=======")
  for k in wikistr.keys():
    print(wikistr[k])
    wikiString += wikistr[k]
  wikiString += "\n|}\n"

  if not debugmodus:
    pass  #pywikibot.Page(pywikibot.getSite('nl'), wikiurl['nl']).put(wikiString, comment='Update') #Save page
  else:
    print("<!----!>")
    print(wikiString)
 
# Guard the entry point so importing this module does not immediately
# trigger a full (network-heavy) bot run.
if __name__ == '__main__':
  main()