Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

# -*- coding: utf-8 -*- 

""" 

@file 

@brief Script to process the data from Cresus for the hackathon 2016 

""" 

import os 

import sqlite3 

import pandas 

from pyquickhelper.loghelper import fLOG 

from pyensae.sql import Database 

 

 

def process_cresus_whole_process(infile, outfold, ratio=0.20, fLOG=fLOG):  # pylint: disable=W0621
    """
    Processes the database from :epkg:`Cresus` until it splits the data
    into two sets of files.

    @param      infile      dump of a sql database
    @param      outfold     output folder, created if it does not exist
    @param      ratio       ratio of rows kept aside for the test set
    @param      fLOG        logging function
    @return                 couple of dictionaries of table files
                            (train, test)
    """
    if not os.path.exists(outfold):
        os.mkdir(outfold)
    basename = os.path.split(infile)[-1]
    cleaned_sql = os.path.join(outfold, basename + ".clean.sql")
    dbname = os.path.join(outfold, basename + ".db3")
    # Clean the MySQL dump and load it into a sqlite3 database.
    process_cresus_sql(infile, out_clean_sql=cleaned_sql,
                       outdb=dbname, fLOG=fLOG)
    # Export every table as a flat file.
    exported = prepare_cresus_data(dbname, outfold=outfold, fLOG=fLOG)
    train, test = split_train_test_cresus_data(
        exported, outfold, ratio=ratio, fLOG=fLOG)
    # Add the X/Y files derived from each 'dossier' table.
    for part in (test, train):
        part.update(split_XY_bind_dataset_cresus_data(part["dossier"], fLOG))
    return train, test

 

 

def cresus_dummy_file():
    """
    Returns the absolute path of the local (encrypted) Cresus database dump.

    @return     local filename
    """
    folder = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(folder, "hackathon_2016_cresus",
                        "bdd_anonyme_cresus.enc")

 

 

def process_cresus_sql(infile, out_clean_sql=None, outdb=None, fLOG=fLOG):  # pylint: disable=W0621
    """
    Processes the database sent by cresus and produces
    a list of flat files.

    @param      infile          dump of a sql database
    @param      out_clean_sql   filename which contains the cleaned sql
                                (defaults to *infile* + ``.cleaned.sql``)
    @param      outdb           sqlite3 file, removed if it exists
                                (defaults to *infile* + ``.db3``)
    @param      fLOG            logging function
    @return                     dataframe with the number of rows per table
    """
    if out_clean_sql is None:
        out_clean_sql = os.path.splitext(infile)[0] + ".cleaned.sql"
    if outdb is None:
        outdb = os.path.splitext(infile)[0] + ".db3"

    dbfile = outdb
    # Start from a fresh database so the function is re-runnable.
    if os.path.exists(dbfile):
        os.remove(dbfile)

    # READING
    fLOG("[process_cresus_sql] reading", infile)
    with open(infile, "r", encoding="utf-8") as f:
        content = f.read()
    fLOG("done")

    # CLEANING: strip MySQL-specific constructs so sqlite3 can run the dump.
    fLOG("[process_cresus_sql] cleaning")
    lines = content.split("\n")
    repl = [("ENGINE=InnoDB DEFAULT CHARSET=latin1", ""),
            ("CHARACTER SET latin1", ""),
            ("CHARACTER SET utf8", ""),
            ("\\'", "''"),
            ("ENGINE=InnoDB AUTO_INCREMENT=100 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT", ""),
            ("ENGINE=MyISAM AUTO_INCREMENT=5 DEFAULT CHARSET=utf8", ""),
            ("ENGINE=MyISAM AUTO_INCREMENT=4 DEFAULT CHARSET=utf8", ""),
            ("ENGINE=MyISAM AUTO_INCREMENT=6 DEFAULT CHARSET=utf8", ""),
            ("int(11) NOT NULL AUTO_INCREMENT", "INTEGER"),
            ]
    new_lines = []
    modified = 0
    for line in lines:
        # sqlite3 has no notion of databases: comment those statements out.
        if line.startswith("CREATE DATABASE "):
            line = "-- " + line
            modified += 1
        if line.startswith("USE `cresus_anonyme`"):
            line = "-- " + line
            modified += 1
        for rep, to in repl:
            if rep in line:
                line = line.replace(rep, to)
                modified += 1
        # if line.startswith("REPLACE INTO "):
        #     line = "INSERT" + line[len("REPLACE"):]
        new_lines.append(line)
    content = "\n".join(new_lines)
    fLOG("done l=", len(content), " modified", modified)

    # DATABASE
    fLOG("[process_cresus_sql] execute", dbfile)
    con = sqlite3.connect(dbfile)
    # Fix: the connection used to leak when executescript raised;
    # the try/finally guarantees it is always closed.
    try:
        try:
            con.executescript(content)
        except Exception as e:
            # sqlite3 quotes the offending token in its message; use it to
            # point at the guilty lines of the dump instead of the bare error.
            try:
                exp = str(e).split('"')[1]
            except Exception:
                raise e
            lines = content.split("\n")
            lines = ["{0}/{2}:{1}".format(i, _, len(lines))
                     for i, _ in enumerate(lines) if exp in _]
            raise Exception("\n".join(lines)) from e
        con.commit()

        # CHECK: list the tables actually created.
        fLOG("[process_cresus_sql] CHECK")
        sql = """SELECT name FROM (SELECT * FROM sqlite_master UNION ALL SELECT * FROM sqlite_temp_master) AS temptbl
            WHERE type in('table','temp') AND name != 'sqlite_sequence' ORDER BY name;"""
        df = pandas.read_sql(sql, con)
        fLOG("[process_cresus_sql]", df)
        fLOG("[process_cresus_sql] done")

        # COUNT: one row per table with its number of rows.
        fLOG("[process_cresus_sql] COUNT")
        sql = " UNION ALL ".join("SELECT \"{0}\" AS name, COUNT(*) AS nb FROM {0}".format(_)
                                 for _ in sorted(df.name))
        df = pandas.read_sql(sql, con)
        fLOG("[process_cresus_sql]", df)
        fLOG("[process_cresus_sql] done")
    finally:
        con.close()

    with open(out_clean_sql, "w", encoding="utf-8") as f:
        f.write(content)

    fLOG("[process_cresus_sql] done")
    return df

 

 

def prepare_cresus_data(dbfile, outfold=None, fLOG=fLOG):  # pylint: disable=W0621
    """
    Prepares the data for the challenge.

    @param      dbfile      database file
    @param      outfold     output folder (current folder if None)
    @param      fLOG        logging function
    @return                 dictionary ``{table name: flat file name}``
    """
    db = Database(dbfile)
    db.connect()

    if outfold is None:
        outfold = "."

    # Columns dropped from every exported table: personal data and
    # bookkeeping columns useless for the challenge.
    # Fix: was a list containing 'tel_fixe' twice; a frozenset removes the
    # duplicate and makes each membership test O(1).
    remove_column = frozenset([
        'nom', 'prenom',
        'tel_fixe', 'tel_mobile', 'email',
        'adresse', 'rdv1', 'rdv2', 'rdv3',
        'fichier_suivi', 'fichier_suivi2', 'media',
        'indicateur_suivi', 'memo', 'num_dossier',
        'etat_old', 'orientation_old', 'indicateur_suivi_old',
        'transfert', 'plan_bdf', 'effacement_dett', "etat",
        'tel_port',
    ])

    # Normalisation of the 'nature' column: empty string, None and '0'
    # become explicit markers; accents removed and case lowered below.
    new_mapping = {'': 'nul1', None: 'nul2',
                   'Sur-endettement': 'Surendettement', '0': 'nul'}

    res = {}
    tables = db.get_table_list()
    for table in tables:
        fLOG("[prepare_cresus_data] exporting", table)
        # NOTE(review): relies on the private attribute ``db._connection``
        # of pyensae's Database — confirm no public accessor exists.
        df = pandas.read_sql("select * from " + table, db._connection)
        cols = [_ for _ in df.columns if _ not in remove_column]
        cols.sort()
        if "orientation" in cols:
            # Move the two target columns to the end of the file.
            cols = [_ for _ in cols if _ not in ("orientation", "nature")]
            cols += ["orientation", "nature"]
            df["nature"] = df.nature.apply(
                lambda x: new_mapping.get(x, x).replace("é", "e").lower())
            fLOG(set(df["nature"]))
        df = df[cols]
        name = os.path.join(outfold, "tbl_" + table + ".txt")
        df.to_csv(name, sep="\t", encoding="utf-8", index=False)
        res[table] = name
    db.close()
    return res

 

 

def split_train_test_cresus_data(tables, outfold, ratio=0.20, fLOG=fLOG): # pylint: disable=W0621
    """
    Splits the tables into two sets for tables (based on users).

    @param tables dictionary of table files,
        @see fn prepare_cresus_data
    @param outfold if not None, output all tables in this folder
    @param ratio ratio of rows going to the test set
    @param fLOG logging function
    @return couple of dictionaries of table files (train, test)
    """
    # Only these tables are split; any other table is copied into both sets.
    splits = ["user", "agenda", "dossier", "budget"]
    df = pandas.read_csv(tables["dossier"], sep="\t", encoding="utf-8")
    short = df[["id", "id_user", "date_ouverture"]
               ].sort_values("date_ouverture")
    nb = len(short)
    # Number of rows assigned to the training set.
    train = int(nb * (1 - ratio))
    # NOTE(review): ``sort_values`` keeps the original index, so
    # ``.loc[:train]`` slices by *label* (up to the row whose original index
    # equals ``train``), not by position — this may not be the first
    # ``train`` rows by date. It is also inconsistent with ``df[:train]``
    # below, which slices the *unsorted* frame positionally. Confirm the
    # intended split before relying on it.
    dossiers = set(short.loc[:train, "id"])
    users = set(short.loc[:train, "id_user"])

    train_tables = {}
    test_tables = {}
    for k, v in tables.items():
        if k not in splits:
            # Table not tied to a user or dossier: duplicated in both sets.
            fLOG("[split_train_test_cresus_data] no split for", k)
            data = pandas.read_csv(v, sep="\t", encoding="utf-8")
            train_tables[k] = data
            test_tables[k] = data
        else:
            if k == "dossier":
                # Positional split of the (unsorted) dossier table.
                train_tables[k] = df[:train].copy()
                test_tables[k] = df[train:].copy()
            else:
                data = pandas.read_csv(v, sep="\t", encoding="utf-8")
                if "id_dossier" in data.columns:
                    # Rows follow the dossier they belong to.
                    key = "id_dossier"
                    select = dossiers
                elif k == "user":
                    key = "id"
                    select = users
                else:
                    raise Exception("unexpected: {0}".format(k))
                try:
                    # ``ens=select`` binds the set at definition time.
                    spl = data[key].apply(lambda x, ens=select: x in ens)
                except KeyError as e:
                    raise Exception("issue for table '{0}' and columns={1}".format(
                        k, data.columns)) from e
                train_tables[k] = data[spl].copy()
                test_tables[k] = data[~spl].copy()
            fLOG("[split_train_test_cresus_data] split for", k,
                 train_tables[k].shape, test_tables[k].shape)

    # Dump every table of both sets as tab-separated flat files.
    rtrain = {}
    for k, v in train_tables.items():
        name = os.path.join(outfold, "tbl_train_" + k + ".txt")
        v.to_csv(name, index=False, sep="\t", encoding="utf-8")
        rtrain[k] = name
    rtest = {}
    for k, v in test_tables.items():
        name = os.path.join(outfold, "tbl_test_" + k + ".txt")
        v.to_csv(name, index=False, sep="\t", encoding="utf-8")
        rtest[k] = name
    return rtrain, rtest

 

 

def split_XY_bind_dataset_cresus_data(filename, fLOG=fLOG):  # pylint: disable=W0621
    """
    Splits XY for the blind set.

    @param      filename    table to split
    @param      fLOG        logging function
    @return                 dictionary of created files

    It assumes the targets are columns *orientation*, *nature*.
    """
    targets = ["orientation", "nature"]
    data = pandas.read_csv(filename, sep="\t", encoding="utf-8")
    # Drop the rows where either target is missing.
    keep = ~(data["orientation"].isnull() | data["nature"].isnull())
    data = data[keep]
    base = os.path.splitext(filename)[0]
    xname = base + ".X.txt"
    yname = base + ".Y.txt"
    # Features on one side, the two targets on the other.
    data.drop(targets, axis=1).to_csv(
        xname, sep="\t", index=False, encoding="utf-8")
    data[targets].to_csv(yname, sep="\t", index=False, encoding="utf-8")
    return {"dossier.X": xname, "dossier.Y": yname}