Coverage for src/mlstatpy/data/wikipedia.py: 83%

47 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-08 05:52 +0200

1""" 

2@file 

3@brief Functions to retrieve data from Wikipedia 

4""" 

5import os 

6from pyquickhelper.loghelper import noLOG 

7from pyquickhelper.filehelper import get_url_content_timeout, ungzip_files 

8from .data_exceptions import DataException 

9 

10 

def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
                       overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hour),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    @param      dt          datetime, formatted into the url with ``strftime``
    @param      folder      where to download
    @param      unzip       unzip the file (the downloaded archive is removed
                            once it has been decompressed)
    @param      timeout     timeout, forwarded to the download helper
    @param      overwrite   download again even if the archive or its
                            decompressed version is already in *folder*,
                            the decompressed file is then refreshed as well
    @param      fLOG        logging function
    @return                 filename, the decompressed file if *unzip* is true,
                            otherwise the archive (or the decompressed file
                            if it is the only one available in *folder*)

    More information on page
    `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = dt.strftime(
        "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz")
    archive = os.path.join(folder, url.rsplit("/", 1)[-1])
    unzipname = os.path.splitext(archive)[0]

    # Download when forced to, or when neither the archive nor its
    # decompressed version is already cached in the folder.
    fetched = overwrite or not (
        os.path.exists(archive) or os.path.exists(unzipname))
    if fetched:
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=archive, chunk=2**20, fLOG=fLOG)

    if not unzip:
        # Return a path which exists: the archive may be missing when only
        # the decompressed file was cached by a previous call.
        return archive if os.path.exists(archive) else unzipname

    # A fresh download must be decompressed again, otherwise a stale
    # decompressed file would be returned and the archive left behind.
    if fetched or not os.path.exists(unzipname):
        # NOTE(review): ungzip_files is expected to write the decompressed
        # file into *folder* and to return its name (or a list of names)
        # -- confirm against pyquickhelper.
        names = ungzip_files(archive, unzip=False, where_to=folder)
        os.remove(archive)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    f"Expecting only one file, not '{names}'")
            return names[0]
        return names

    # The decompressed file is already there, the archive usually is not
    # anymore: return the file which exists.
    return unzipname

48 

49 

def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/
    <https://dumps.wikimedia.org/frwiki/latest/>`_.

    @param      country     country, used to build the wiki name
                            (``fr`` gives ``frwiki``)
    @param      name        name of the stream to download
    @param      folder      where to download
    @param      unzip       unzip the file (the downloaded archive is removed
                            once it has been decompressed)
    @param      timeout     timeout, forwarded to the download helper
    @param      overwrite   download again even if the archive or its
                            decompressed version is already in *folder*,
                            the decompressed file is then refreshed as well
    @param      fLOG        logging function
    @return                 filename, the decompressed file if *unzip* is true,
                            otherwise the downloaded archive when it exists
    """
    url = f"https://dumps.wikimedia.org/{country}wiki/latest/{country}wiki-{name}"
    archive = os.path.join(folder, url.rsplit("/", 1)[-1])
    unzipname = os.path.splitext(archive)[0]
    # Name returned when the decompressed file is already available.
    stripped = archive[:-3] if archive.endswith('.gz') else archive

    # Download when forced to, or when neither the archive nor its
    # decompressed version is already cached in the folder.
    fetched = overwrite or not (
        os.path.exists(archive) or os.path.exists(unzipname))
    if fetched:
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=archive, chunk=2**20, fLOG=fLOG)

    if not unzip:
        # Nothing gets decompressed: the file on disk is the archive itself,
        # returning the name without '.gz' would point to a missing file.
        return archive if os.path.exists(archive) else stripped

    # A fresh download must be decompressed again, otherwise a stale
    # decompressed file would be returned and the archive left behind.
    if fetched or not os.path.exists(unzipname):
        # NOTE(review): ungzip_files is expected to write the decompressed
        # file into *folder* and to return its name (or a list of names)
        # -- confirm against pyquickhelper.
        names = ungzip_files(archive, unzip=False, where_to=folder)
        os.remove(archive)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    f"Expecting only one file, not '{names}'")
            return names[0]
        return names
    return stripped

83 

84 

def download_titles(country, folder=".", unzip=True, timeout=-1,
                    overwrite=False, fLOG=noLOG):
    """
    Downloads the list of wikipedia titles, stream
    ``latest-all-titles-in-ns0.gz``, see
    `dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz
    <https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz>`_.
    The function only delegates to ``download_dump``.

    @param      country     country
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 whatever ``download_dump`` returns for that stream
    """
    stream = "latest-all-titles-in-ns0.gz"
    options = dict(folder=folder, unzip=unzip, timeout=timeout,
                   overwrite=overwrite, fLOG=fLOG)
    return download_dump(country, stream, **options)

102 

103 

def normalize_wiki_text(text):
    """
    Normalizes a text such as a wikipedia title: underscores become
    spaces and two consecutive single quotes become one double quote.

    @param      text        text to normalize
    @return                 normalized text
    """
    # Order matters only for readability here, both substitutions
    # work on disjoint characters.
    substitutions = (("_", " "), ("''", '"'))
    for before, after in substitutions:
        text = text.replace(before, after)
    return text

112 

113 

def enumerate_titles(filename, norm=True, encoding="utf8"):
    """
    Iterates over the titles stored in a text file, one title per line.
    Spaces, tabs and line breaks surrounding a title are removed.

    @param      filename    name of the text file to read
    @param      norm        if true, every title goes through
                            ``normalize_wiki_text`` before being returned
    @param      encoding    encoding used to open the file
    @return                 iterator on titles (the function is a generator,
                            the file is opened when the iteration starts)
    """
    # Choose once the processing applied to every stripped line.
    finalize = normalize_wiki_text if norm else None
    with open(filename, "r", encoding=encoding) as stream:
        for raw in stream:
            title = raw.strip(" \r\n\t")
            yield title if finalize is None else finalize(title)