# Source code of mlstatpy.data.wikipedia

"""
Functions to retrieve data from Wikipedia


:githublink:`%|py|5`
"""
import os
from pyquickhelper.loghelper import noLOG
from pyquickhelper.filehelper import get_url_content_timeout, ungzip_files
from .data_exceptions import DataException


def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
                       overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hour),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    :param      dt:          datetime, formatted into the url with ``strftime``
    :param      folder:      where to download
    :param      unzip:       extract the archive and delete it afterwards
    :param      timeout:     timeout, forwarded to the download function
    :param      overwrite:   download again even if the archive or the
                             extracted file is already in *folder*
    :param      fLOG:        logging function
    :return:                 filename, the extracted file if it was unzipped
                             (now or by a previous call), the archive otherwise
    :raises DataException:   if the archive expands into more than one file

    More information on page
    `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.


    :githublink:`%|py|29`
    """
    url = dt.strftime(
        "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz")
    archive = os.path.join(folder, url.split("/")[-1])
    # Path of the archive without its last extension (".gz").
    extracted = os.path.splitext(archive)[0]

    # Nothing is downloaded when either form of the file is already there,
    # unless the caller explicitly asks for it.
    if overwrite or not (os.path.exists(archive) or os.path.exists(extracted)):
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=archive, chunk=2**20, fLOG=fLOG)

    if unzip and not os.path.exists(extracted):
        names = ungzip_files(archive, unzip=False, where_to=folder)
        # The archive is not kept once its content has been extracted.
        os.remove(archive)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names

    # A previous call may have extracted the file and removed the archive:
    # return the path which really exists instead of the deleted archive.
    if not os.path.exists(archive) and os.path.exists(extracted):
        return extracted
    return archive


def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/
    <https://dumps.wikimedia.org/frwiki/latest/>`_.

    :param      country:     country, used as the wiki prefix in the url
                             (``{country}wiki``)
    :param      name:        name of the stream to download
    :param      folder:      where to download
    :param      unzip:       extract the archive and delete it afterwards
    :param      timeout:     timeout, forwarded to the download function
    :param      overwrite:   download again even if the archive or the
                             extracted file is already in *folder*
    :param      fLOG:        logging function
    :return:                 filename
    :raises DataException:   if the archive expands into more than one file


    :githublink:`%|py|64`
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    # A dedicated local keeps the *name* parameter (the stream) untouched.
    archive = os.path.join(folder, url.split("/")[-1])
    # Path of the archive without its last extension.
    extracted = os.path.splitext(archive)[0]

    # Nothing is downloaded when either form of the file is already there,
    # unless the caller explicitly asks for it.
    if overwrite or not (os.path.exists(archive) or os.path.exists(extracted)):
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=archive, chunk=2**20, fLOG=fLOG)

    if unzip and not os.path.exists(extracted):
        names = ungzip_files(archive, unzip=False, where_to=folder)
        # The archive is not kept once its content has been extracted.
        os.remove(archive)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names

    stripped = archive[:-3] if archive.endswith('.gz') else archive
    # With unzip=False only the archive may be on disk: do not return the
    # name of an extracted file which was never produced.
    if not os.path.exists(stripped) and os.path.exists(archive):
        return archive
    return stripped


def download_titles(country, folder=".", unzip=True, timeout=-1,
                    overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia titles from
    `dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz
    <https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz>`_.

    This is :func:`download_dump` called with the stream
    ``latest-all-titles-in-ns0.gz``, every other argument is forwarded.

    :param      country:     country
    :param      folder:      where to download
    :param      unzip:       unzip the file
    :param      timeout:     timeout
    :param      overwrite:   overwrite
    :param      fLOG:        logging function
    :return:                 whatever :func:`download_dump` returns (a filename)


    :githublink:`%|py|98`
    """
    return download_dump(country, "latest-all-titles-in-ns0.gz",
                         folder, unzip=unzip, timeout=timeout,
                         overwrite=overwrite, fLOG=fLOG)


def normalize_wiki_text(text):
    """
    Normalizes a text such as a wikipedia title:
    every underscore becomes a space and every pair of single quotes
    (``''``) becomes one double quote (``"``).

    :param      text:        text to normalize
    :return:                 normalized text


    :githublink:`%|py|110`
    """
    return text.replace("_", " ").replace("''", '"')


def enumerate_titles(filename, norm=True, encoding="utf8"):
    """
    Enumerates titles from a file, one title per line.
    Spaces, tabs and end of line characters are removed from both ends
    of every line before it is yielded, empty lines are not skipped.

    :param      filename:        filename
    :param      norm:            normalize in the function
                                 (calls :func:`normalize_wiki_text` on every title)
    :param      encoding:        encoding
    :return:                     iterator on titles (strings)


    :githublink:`%|py|121`
    """
    # The file is read lazily, line by line, it is only opened
    # when the first title is requested.
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            title = line.strip(" \r\n\t")
            yield normalize_wiki_text(title) if norm else title
# Links
#
# Contents
#
# Information
#
# Source code of mlstatpy.data.wikipedia