Coverage for src/mlstatpy/ml/matrices.py: 100%

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Algorithms about matrices.

5"""

6import warnings

7import numpy

8import numpy.linalg

9from scipy.linalg.lapack import dtrtri # pylint: disable=E0611

12def gram_schmidt(mat, change=False):

13 """

14 Applies the `Gram–Schmidt process

15 <https://en.wikipedia.org/wiki/Gram%E2%80%93Schmidt_process>`_.

16 Due to performance, every row is considered as a vector.

18 @param mat matrix

19 @param change returns the matrix to change the basis

20 @return new matrix or (new matrix, change matrix)

22 The function assumes the matrix *mat* is

23 horizontal: it has more columns than rows.

25 .. note::

26 The implementation could be improved

27 by directly using :epkg:`BLAS` function.

29 .. runpython::

30 :showcode:

32 import numpy

33 from mlstatpy.ml.matrices import gram_schmidt

35 X = numpy.array([[1., 2., 3., 4.],

36 [5., 6., 6., 6.],

37 [5., 6., 7., 8.]])

38 T, P = gram_schmidt(X, change=True)

39 print(T)

40 print(P)

41 """

42 if len(mat.shape) != 2:

43 raise ValueError("mat must be a matrix.") # pragma: no cover

44 if mat.shape[1] < mat.shape[0]:

45 raise RuntimeError( # pragma: no cover

46 "The function only works if the number of rows is less "

47 "than the number of columns.")

48 if change:

49 base = numpy.identity(mat.shape[0])

50 # The following code is equivalent to:

51 # res = numpy.empty(mat.shape)

52 # for i in range(0, mat.shape[0]):

53 # res[i, :] = mat[i, :]

54 # for j in range(0, i):

55 # d = numpy.dot(res[j, :], mat[i, :])

56 # res[i, :] -= res[j, :] * d

57 # if change:

58 # base[i, :] -= base[j, :] * d

59 # d = numpy.dot(res[i, :], res[i, :])

60 # if d > 0:

61 # d **= 0.5

62 # res[i, :] /= d

63 # if change:

64 # base[i, :] /= d

65 # But it is faster to write it this way:

66 res = numpy.empty(mat.shape)

67 for i in range(0, mat.shape[0]):

68 res[i, :] = mat[i, :]

69 if i > 0:

70 d = numpy.dot(res[:i, :], mat[i, :])

71 m = numpy.multiply(res[:i, :], d.reshape((len(d), 1)))

72 m = numpy.sum(m, axis=0)

73 res[i, :] -= m

74 if change:

75 m = numpy.multiply(base[:i, :], d.reshape((len(d), 1)))

76 m = numpy.sum(m, axis=0)

77 base[i, :] -= m

78 d = numpy.dot(res[i, :], res[i, :])

79 if d > 0:

80 d **= 0.5

81 res[i, :] /= d

82 if change:

83 base[i, :] /= d

84 return (res, base) if change else res

87def linear_regression(X, y, algo=None):

88 """

89 Solves the linear regression problem,

90 find :math:`\\beta` which minimizes

91 :math:`\\norme{y - X\\beta}`, based on the algorithm

92 :ref:`Arbre de décision optimisé pour les régressions linéaires

93 <algo_decision_tree_mselin>`.

95 @param X features

96 @param y targets

97 @param algo None to use the standard algorithm

98 :math:`\\beta = (X'X)^{-1} X'y`,

99 `'gram'`, `'qr'`

100 @return beta

101

102 .. runpython::

103 :showcode:

104

105 import numpy

106 from mlstatpy.ml.matrices import linear_regression

107

108 X = numpy.array([[1., 2., 3., 4.],

109 [5., 6., 6., 6.],

110 [5., 6., 7., 8.]]).T

111 y = numpy.array([0.1, 0.2, 0.19, 0.29])

112 beta = linear_regression(X, y, algo="gram")

113 print(beta)

114

115 ``algo=None`` computes :math:`\\beta = (X'X)^{-1} X'y`.

116 ``algo='qr'`` uses a `QR <https://docs.scipy.org/doc/numpy/reference

117 /generated/numpy.linalg.qr.html>`_ decomposition and calls function

118 `dtrtri <https://docs.scipy.org/doc/scipy/reference/generated/scipy.

119 linalg.lapack.dtrtri.html>`_ to invert an upper triangular matrix.

120 ``algo='gram'`` uses :func:`gram_schmidt

121 <mlstatpy.ml.matrices.gram_schmidt>` and then computes

122 the solution of the linear regression (see above for a link

123 to the algorithm).

124 """

125 if len(y.shape) != 1:

126 warnings.warn( # pragma: no cover

127 "This function is not tested for a multidimensional linear regression.")

128 if algo is None:

129 inv = numpy.linalg.inv(X.T @ X)

130 return inv @ (X.T @ y)

131 if algo == "gram":

132 T, P = gram_schmidt(X.T, change=True)

133 # T = P X

134 return (y.T @ T.T @ P).ravel()

135 if algo == "qr":

136 Q, R = numpy.linalg.qr(X, "full")

137 Ri = dtrtri(R)[0]

138 gamma = (y.T @ Q).ravel()

139 return (gamma @ Ri.T).ravel()

140 raise ValueError( # pragma: no cover

141 f"Unknwown algo='{algo}'.")

142

143

144def norm2(X):

145 """

146 Computes the square norm for all rows of a

147 matrix.

148 """

149 res = numpy.empty(X.shape[1])

150 for i in range(X.shape[1]):

151 res[i] = numpy.dot(X[i], X[i])

152 return res

153

154

155def streaming_gram_schmidt_update(Xk, Pk):

156 """

157 Updates matrix :math:`P_k` to produce :math:`P_{k+1}`

158 which is the matrix *P* in algorithm

159 :ref:`Streaming Linear Regression

160 <algo_reg_lin_gram_schmidt_streaming>`.

161 The function modifies the matrix *Pk*

162 given as an input.

163

164 @param Xk kth row

165 @param Pk matrix *P* at iteration *k-1*

166 """

167 tki = Pk @ Xk

168 idi = numpy.identity(Pk.shape[0])

169

170 for i in range(0, Pk.shape[0]):

171 val = tki[i]

172

173 if i > 0:

174 # for j in range(0, i):

175 # d = tki[j] * val

176 # tki[i] -= tki[j] * d

177 # Pk[i, :] -= Pk[j, :] * d

178 # idi[i, :] -= idi[j, :] * d

179

180 dv = tki[:i] * val

181 tki[i] -= numpy.dot(dv, tki[:i])

182 dv = dv.reshape((i, 1))

183 Pk[i, :] -= numpy.multiply(Pk[:i, :], dv).sum(axis=0)

184 idi[i, :] -= numpy.multiply(idi[:i, :], dv).sum(axis=0)

185

186 d = numpy.square(idi[i, :]).sum() # pylint: disable=E1101

187 d = tki[i] ** 2 + d

188 if d > 0:

189 d **= 0.5

190 d = 1. / d

191 tki[i] *= d

192 Pk[i, :] *= d

193 idi[i, :] *= d

194

195

196def streaming_gram_schmidt(mat, start=None):

197 """

198 Solves the linear regression problem,

199 find :math:`\\beta` which minimizes

200 :math:`\\norme{y - X\\beta}`, based on

201 algorithm :ref:`Streaming Gram-Schmidt

202 <algo_reg_lin_gram_schmidt_streaming>`.

203

204 @param mat matrix

205 @param start first row to start iteration, ``X.shape[1]`` by default

206 @return iterator on

207

208 The function assumes the matrix *mat* is

209 horizontal: it has more columns than rows.

210

211 .. runpython::

212 :showcode:

213

214 import numpy

215 from mlstatpy.ml.matrices import streaming_gram_schmidt

216

217 X = numpy.array([[1, 0.5, 10., 5., -2.],

218 [0, 0.4, 20, 4., 2.],

219 [0, 0.7, 20, 4., 2.]], dtype=float).T

220

221 for i, p in enumerate(streaming_gram_schmidt(X.T)):

222 print("iteration", i, "\\n", p)

223 t = X[:i+3] @ p.T

224 print(t.T @ t)

225 """

226 if len(mat.shape) != 2:

227 raise ValueError("mat must be a matrix.") # pragma: no cover

228 if mat.shape[1] < mat.shape[0]:

229 raise RuntimeError("The function only works if the number of rows is less "

230 "than the number of columns.")

231 if start is None:

232 start = mat.shape[0]

233 mats = mat[:, :start]

234 _, Pk = gram_schmidt(mats, change=True)

235 yield Pk

236

237 k = start

238 while k < mat.shape[1]:

239 streaming_gram_schmidt_update(mat[:, k], Pk)

240 yield Pk

241 k += 1

242

243

244def streaming_linear_regression_update(Xk, yk, XkXk, bk):

245 """

246 Updates coefficients :math:`\\beta_k` to produce :math:`\\beta_{k+1}`

247 in :ref:`l-piecewise-linear-regression`.

248 The function modifies the matrix *Pk*

249 given as an input.

250

251 @param Xk kth row

252 @param yk kth target

253 @param XkXk matrix :math:`X_{1..k}'X_{1..k}', updated by the function

254 @param bk current coefficient (updated by the function)

255 """

256 Xk = Xk.reshape((1, XkXk.shape[0]))

257 xxk = Xk.T @ Xk

258 XkXk += xxk

259 err = Xk.T * (yk - Xk @ bk)

260 bk[:] += (numpy.linalg.inv(XkXk) @ err).flatten()

261

262

263def streaming_linear_regression(mat, y, start=None):

264 """

265 Streaming algorithm to solve a linear regression.

266 See :ref:`l-piecewise-linear-regression`.

267

268 @param mat features

269 @param y expected target

270 @return iterator on coefficients

271

272 .. runpython::

273 :showcode:

274

275 import numpy

276 from mlstatpy.ml.matrices import streaming_linear_regression, linear_regression

277

278 X = numpy.array([[1, 0.5, 10., 5., -2.],

279 [0, 0.4, 20, 4., 2.],

280 [0, 0.7, 20, 4., 3.]], dtype=float).T

281 y = numpy.array([1., 0.3, 10, 5.1, -3.])

282

283 for i, bk in enumerate(streaming_linear_regression(X, y)):

284 bk0 = linear_regression(X[:i+3], y[:i+3])

285 print("iteration", i, bk, bk0)

286 """

287 if len(mat.shape) != 2:

288 raise ValueError("mat must be a matrix.") # pragma: no cover

289 if mat.shape[0] < mat.shape[1]:

290 raise RuntimeError("The function only works if the number of rows is more "

291 "than the number of columns.")

292 if len(y.shape) != 1:

293 warnings.warn( # pragma: no cover

294 "This function is not tested for a multidimensional linear regression.")

295 if start is None:

296 start = mat.shape[1]

297

298 Xk = mat[:start]

299 XkXk = Xk.T @ Xk

300 bk = numpy.linalg.inv(XkXk) @ (Xk.T @ y[:start])

301 yield bk

302

303 k = start

304 while k < mat.shape[0]:

305 streaming_linear_regression_update(mat[k], y[k], XkXk, bk)

306 yield bk

307 k += 1

308

309

310def streaming_linear_regression_gram_schmidt_update(Xk, yk, Xkyk, Pk, bk):

311 """

312 Updates coefficients :math:`\\beta_k` to produce :math:`\\beta_{k+1}`

313 in :ref:`Streaming Linear Regression

314 <l-piecewise-linear-regression-gram_schmidt>`.

315 The function modifies the matrix *Pk*

316 given as an input.

317

318 @param Xk kth row

319 @param yk kth target

320 @param Xkyk matrix :math:`X_{1..k}' y_{1..k}' (updated by the function)

321 @param Pk Gram-Schmidt matrix produced by the streaming algorithm

322 (updated by the function)

323 @param bk current coefficient (updated by the function)

324 """

325 Xk = Xk.T

326 streaming_gram_schmidt_update(Xk, Pk)

327 Xkyk += (Xk * yk).reshape(Xkyk.shape)

328 bk[:] = Pk @ Xkyk @ Pk

329

330

331def streaming_linear_regression_gram_schmidt(mat, y, start=None):

332 """

333 Streaming algorithm to solve a linear regression with

334 Gram-Schmidt algorithm.

335 See :ref:`l-piecewise-linear-regression-gram_schmidt`.

336

337 @param mat features

338 @param y expected target

339 @return iterator on coefficients

340

341 .. runpython::

342 :showcode:

343

344 import numpy

345 from mlstatpy.ml.matrices import streaming_linear_regression, linear_regression

346

347 X = numpy.array([[1, 0.5, 10., 5., -2.],

348 [0, 0.4, 20, 4., 2.],

349 [0, 0.7, 20, 4., 3.]], dtype=float).T

350 y = numpy.array([1., 0.3, 10, 5.1, -3.])

351

352 for i, bk in enumerate(streaming_linear_regression(X, y)):

353 bk0 = linear_regression(X[:i+3], y[:i+3])

354 print("iteration", i, bk, bk0)

355 """

356 if len(mat.shape) != 2:

357 raise ValueError("mat must be a matrix.") # pragma: no cover

358 if mat.shape[0] < mat.shape[1]:

359 raise RuntimeError("The function only works if the number of rows is more "

360 "than the number of columns.")

361 if len(y.shape) != 1:

362 warnings.warn( # pragma: no cover

363 "This function is not tested for a multidimensional linear regression.")

364 if start is None:

365 start = mat.shape[1]

366

367 Xk = mat[:start]

368 xyk = Xk.T @ y[:start]

369 _, Pk = gram_schmidt(Xk.T, change=True)

370 bk = Pk @ xyk @ Pk

371 yield bk

372

373 k = start

374 while k < mat.shape[0]:

375 streaming_linear_regression_gram_schmidt_update(

376 mat[k], y[k], xyk, Pk, bk)

377 yield bk

378 k += 1