Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Functions to retrieve data from Wikipedia
4"""
5import os
6from pyquickhelper.loghelper import noLOG
7from pyquickhelper.filehelper import get_url_content_timeout, ungzip_files
8from .data_exceptions import DataException
def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
                       overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hour),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    @param dt datetime
    @param folder where to download
    @param unzip unzip the file
    @param timeout timeout
    @param overwrite overwrite
    @param fLOG logging function
    @return filename

    More information on page
    `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz"
    url = dt.strftime(url)
    file = url.split("/")[-1]
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    # Skip the download when either the archive or its unzipped version
    # is already on disk (unless overwrite is requested).
    if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=name, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    # Bug fix, consistent with download_dump: when unzipping was requested
    # and the unzipped file already exists (the .gz may have been removed by
    # a previous call), return the unzipped path, not the archive path.
    return unzipname if unzip else name
def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/
    <https://dumps.wikimedia.org/frwiki/latest/>`_.

    @param country country
    @param name name of the stream to download
    @param folder where to download
    @param unzip unzip the file
    @param timeout timeout
    @param overwrite overwrite
    @param fLOG logging function
    @return local filename
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    filename = url.rsplit("/", 1)[-1]
    dest = os.path.join(folder, filename)
    dest_unzipped = os.path.splitext(dest)[0]
    # Download only when neither the archive nor its unzipped version
    # is already present, unless an overwrite is requested.
    already_there = os.path.exists(dest) or os.path.exists(dest_unzipped)
    if overwrite or not already_there:
        get_url_content_timeout(url, timeout=timeout, encoding=None,
                                output=dest, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(dest_unzipped):
        produced = ungzip_files(dest, unzip=False, where_to=folder)
        os.remove(dest)
        if isinstance(produced, list):
            if len(produced) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(produced))
            return produced[0]
        return produced
    return dest[:-3] if dest.endswith('.gz') else dest
def download_titles(country, folder=".", unzip=True, timeout=-1,
                    overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia titles from
    `dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz
    <https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz>`_.

    @param country country
    @param folder where to download
    @param unzip unzip the file
    @param timeout timeout
    @param overwrite overwrite
    @param fLOG logging function
    """
    # Thin wrapper: the titles file is just one particular dump stream.
    stream = "latest-all-titles-in-ns0.gz"
    return download_dump(country, stream, folder,
                         unzip=unzip, timeout=timeout,
                         overwrite=overwrite, fLOG=fLOG)
def normalize_wiki_text(text):
    """
    Normalizes a text such as a wikipedia title.

    @param text text to normalize
    @return normalized text
    """
    # Underscores stand for spaces in wikipedia titles;
    # doubled single quotes are wiki markup for a double quote.
    without_underscores = text.replace("_", " ")
    return without_underscores.replace("''", '"')
def enumerate_titles(filename, norm=True, encoding="utf8"):
    """
    Enumerates titles from a file, one title per line.

    @param filename filename
    @param norm normalize in the function
    @param encoding encoding
    """
    with open(filename, "r", encoding=encoding) as f:
        for raw in f:
            title = raw.strip(" \r\n\t")
            # Normalization is optional so callers can keep raw titles.
            yield normalize_wiki_text(title) if norm else title