# Gebruiker:DajasjBot/archivelinks.py
#
# Uit Wikipedia, de vrije encyclopedie
from bz2file import BZ2File
from lxml import etree
import os
import waybackpy
import pywikibot
import time
import re
from tqdm import notebook
from datetime import datetime
# pywikibot site handle used to load and save the live pages below.
# NOTE(review): created without arguments, so presumably the wiki is taken
# from the local pywikibot user configuration -- confirm.
site = pywikibot.Site()
# Reference timestamp for the rate-limit pause that follows each page save.
start = datetime.now()

# User-agent string handed to waybackpy for every Wayback Machine request.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"


def citetemplate_to_dict(citetemplate):
    """Parse the text of a cite template into a ``{parameter: value}`` dict.

    The template is expected to end in ``}}``; the part before the first
    ``|`` (the template name) is ignored.  Keys and values are returned
    exactly as written, including surrounding whitespace, so the template
    can be rebuilt without reformatting it.

    Because the text is split on ``|``, a wikilink such as ``[[Page|label]]``
    inside a value is cut into several segments.  Segments without ``=`` are
    glued back onto the preceding parameter when that parameter's value
    still has an unclosed ``[[`` or when the segment itself contains ``]]``.
    Any other segment without ``=`` (a positional parameter) is dropped.
    A segment containing ``=`` always starts a new parameter.

    Args:
        citetemplate: Full template text, e.g. ``"{{Citeer web|url=...}}"``.

    Returns:
        dict mapping the raw parameter names to their raw values.
    """
    # Strip the closing "}}" and discard the template name.
    segments = citetemplate[:-2].split("|")[1:]
    dict_of_citetemplate = {}
    # Parameter the immediately preceding segment belonged to; None when
    # there is no such parameter (start of template, or after a dropped
    # positional segment).  Looking this up instead of re-splitting the
    # previous segment avoids a KeyError on multi-pipe links and on a
    # leading segment that contains "]]".
    last_key = None

    for segment in segments:
        if "=" in segment:
            key, value = segment.split("=", 1)
            dict_of_citetemplate[key] = value
            last_key = key
            continue

        if last_key is not None:
            current = dict_of_citetemplate[last_key]
            link_is_open = current.count("[[") > current.count("]]")
            if link_is_open or "]]" in segment:
                dict_of_citetemplate[last_key] = current + "|" + segment
                continue

        # Positional parameter: not stored, and it ends any continuation.
        last_key = None

    return dict_of_citetemplate


# Lower-case Dutch month names mapped to their month number (1-12).
# The names are listed in calendar order, so the number is the position.
datedict = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            "januari",
            "februari",
            "maart",
            "april",
            "mei",
            "juni",
            "juli",
            "augustus",
            "september",
            "oktober",
            "november",
            "december",
        ],
        start=1,
    )
}

# Namespace prefix of the MediaWiki XML export schema used in the dump.
_EXPORT_NS = '{http://www.mediawiki.org/xml/export-0.10/}'

# A {{Citeer web ...}} template that contains no nested braces.
_CITE_WEB_RE = re.compile("{{[Cc]iteer web[^{}]*?}}")
# An archive-url parameter that already carries a value of 10+ characters.
_ARCHIVE_URL_RE = re.compile(
    r"archief-{0,1}url\s*?=\s*?[^\|\s]{10,}|archive-{0,1}url\s*?=\s*?[^\|\s]{10,}",
    re.IGNORECASE)

# Supported notations for the access date of a citation.
_ISO_DATE_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
_DMY_DATE_RE = re.compile(r"(\d{1,2})-(\d{1,2})-(\d{4})")
_TEXT_DATE_RE = re.compile(r"(\d{1,2})\s([a-zA-Z]{3,10})\s(\d{4})")

# Parameter names that may hold the access date, in order of preference.
_ACCESS_DATE_KEYS = (
    'bezochtdatum', 'accessdate', 'datumbezocht', 'datumgeraadpleegd',
    'raadpleegdatum', 'access-date',
)

# Archive-related parameters that are replaced by freshly generated ones.
_ARCHIVE_PARAMS = frozenset([
    "archiefdatum", "archivedate", "archive-date", "dodeurl",
    "dode-url", "deadurl", "dead-url", "archiefurl",
    "archiveurl", "archive-url"
])


def _templates_missing_archive(text):
    """Return the cite-web templates in *text* that lack an archive URL."""
    return [
        template for template in _CITE_WEB_RE.findall(text)
        if not _ARCHIVE_URL_RE.search(template)
    ]


def _parse_retrieve_date(retrievedate):
    """Return ``(year, month, day)`` as ints, or None if not recognised.

    Understands ``YYYY-MM-DD``, ``D-M-YYYY`` and ``D <Dutch month> YYYY``.
    """
    match = _ISO_DATE_RE.search(retrievedate)
    if match:
        year, month, day = match.groups()
        return int(year), int(month), int(day)

    match = _DMY_DATE_RE.search(retrievedate)
    if match:
        day, month, year = match.groups()
        return int(year), int(month), int(day)

    match = _TEXT_DATE_RE.search(retrievedate)
    if match:
        # Parse the actual access date (not a fixed sample string).
        month = datedict.get(match.group(2).lower())
        if month is not None:
            return int(match.group(3)), month, int(match.group(1))

    return None


def _fetch_archive(url, retrievedate):
    """Return a Wayback Machine archive object for *url*, or None on failure.

    A URL without any archive is saved now.  Otherwise the snapshot nearest
    to *retrievedate* is used, or the newest one when no date is given.
    Failures are printed and reported as None so that a single bad citation
    does not abort the whole run.
    """
    try:
        wayback = waybackpy.Url(url, user_agent)

        # Check if already archived
        if wayback.total_archives() == 0:
            return wayback.save()

        if retrievedate == "":
            return wayback.newest()

        date_parts = _parse_retrieve_date(retrievedate)
        if date_parts is None:
            print("date is wrong")
            print(url)
            print(retrievedate)
            return None

        year, month, day = date_parts
        return wayback.near(year=year, month=month, day=day)
    except Exception as e:
        # Deliberately broad: waybackpy and the network can fail in many
        # ways and the bot should simply skip this citation.
        print(e)
        print(url)
        return None


# The dump file must be downloaded manually into the working directory.
with BZ2File("nlwiki-latest-pages-articles.xml.bz2") as xml_file:
    file = etree.iterparse(xml_file, tag=_EXPORT_NS + 'page')
    for _, dump_page in notebook.tqdm(file):
        try:
            # Get the text (an empty <text/> element yields None)
            dump_text = dump_page.findtext(
                _EXPORT_NS + 'revision/' + _EXPORT_NS + 'text') or ""
            # Get the title
            title = dump_page.findtext(_EXPORT_NS + 'title')

            # Check if dump has missing archive urls
            if not _templates_missing_archive(dump_text):
                continue

            # REMOVE
            title = "Gebruiker:DajasjBot/Kladblok"

            # Get page from the live version
            page = pywikibot.Page(site, title)
            live_text = page.text

            # Check protection: skip pages only administrators may edit
            edit_protection = page.protection().get("edit")
            if edit_protection and edit_protection[0] == "sysop":
                continue

            # UNCOMMENT
            # # Check namespace
            # if page.namespace().id != 0:
            #     continue

            # Check if the live version also misses archive urls
            list_citetemplates_live = _templates_missing_archive(live_text)
            if not list_citetemplates_live:
                continue

            changed = False

            for citetemplate in list_citetemplates_live:
                # Raw keys/values keep the template's own whitespace for the
                # rebuild; the stripped copy is used for lookups.
                raw_params = citetemplate_to_dict(citetemplate)
                params = {
                    key.strip(): value.strip()
                    for key, value in raw_params.items()
                }

                # Get url
                url = params.get('url', params.get('URL'))
                if not url:
                    continue

                # Get retrieved date
                retrievedate = next(
                    (params[key] for key in _ACCESS_DATE_KEYS
                     if key in params), "")

                archive = _fetch_archive(url, retrievedate)
                if archive is None:
                    continue

                # Create new template without the old archive parameters
                new_params = {
                    key: value
                    for key, value in raw_params.items()
                    if key.strip() not in _ARCHIVE_PARAMS
                }
                new_params['archiefurl'] = archive.archive_url
                new_params['archiefdatum'] = archive.timestamp.strftime(
                    "%Y-%m-%d")
                new_params['dodeurl'] = "nee"
                new_citetemplate = (
                    citetemplate.split("|")[0] + "|" + "|".join(
                        key + "=" + value
                        for key, value in new_params.items()) + "}}")

                # Add to live_text
                live_text = live_text.replace(citetemplate, new_citetemplate)
                changed = True

            if changed:
                page.text = live_text
                page.save(u"Archiefurl toegevoegd")

                # Pause for rate limit: at most one save per 60 seconds.
                # total_seconds() is used because .seconds wraps every day.
                elapsed = (datetime.now() - start).total_seconds()
                time.sleep(max(0, 60 - elapsed))
                start = datetime.now()

            # NOTE(review): stops after the first processed page; this looks
            # like a test-run limiter that belongs with the "REMOVE" title
            # override above -- confirm before removing.
            break
        finally:
            # Always release the parsed element, also on continue/break,
            # so memory does not grow while streaming the dump.
            dump_page.clear()

            # Also eliminate now-empty references from the root node to elem
            for ancestor in dump_page.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]