Coverage for src/ensae_teaching_cs/homeblog/program_helper.py: 96%
55 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
1"""
2@file
3@brief Various functions about programs, such as guessing the language of a piece of code
4"""
5import re
def guess_language_code(code):
    """
    Guess the language of a piece of code.
    The detected language is one of: py, sql, xml, css, vb, cpp, js.

    @param      code        code (a string)
    @return                 tuple *(language, score)* where the score is in [0, 1]
                            (1 is good), or a bare ``None`` when no language
                            clearly wins

    The algorithm compares the languages two by two (a "battle"): for each
    ordered pair it counts the tokens of *code* which are keywords of one
    language but not of the other, plus the number of comment markers when
    both languages do not share the same comment syntax.
    A language is returned only if it is the single one which wins (or ties)
    every battle. The score is the lowest share of hits it obtained
    among all its battles.
    """
    # Everything is flattened onto a single line: the token expressions below
    # expect a delimiter on each side of a word. Non-breaking spaces are folded
    # into regular spaces so that they also act as delimiters after a tag name.
    code = (code.replace("\u00a0", " ").replace("\r", "")
            .replace("\n", " ").replace("\t", " "))
    stripcode = code.strip()
    if stripcode.startswith(("<html>", "<xml", "<!DOCTYPE html>")):
        return ('xml', 1.0)

    # exp1 extracts lowercase words of 2 to 8 letters surrounded by delimiters.
    exp1 = re.compile(r"[^a-z]([a-z]{2,8})[^a-z0-9]")
    # exp2 extracts tags. It must hold exactly one capturing group, otherwise
    # findall returns tuples which never match the string keywords below.
    # A tag closed by '>' keeps it ("<body>"), a tag followed by a space is
    # returned without the space ("<script") to match the way keywords are written.
    exp2 = re.compile(r"(</?[a-z]{2,8}(?:>|(?= )))")

    keywords = {
        "py": set(("format with len from numpy enumerate as and or ord range try except " +
                   "raise for while if else elif with self assert " +
                   "for in if not import del from map random sys append except in range elif " +
                   "float str def raise except none").split()),
        "sql": set("on outer full as count and or desc asc from select group by order where join inner".split()),
        "xml": set("<body> <xml> </body> <script> <script </script> <head> </head> <meta> <meta </meta>".split()),
        "css": set("border font background size".split()),
        "vb": set("error for sub function while wend then to end next dim set".split()),
        "cpp": set(("ord try catch throw try for while if else push for foreach delete vector map if " +
                    "catch void double string new throw null").split()),
        "js": set("try catch throw for while if else push for in if catch var throw new function null".split()),
    }
    # NOTE(review): "//[/]" only matches a triple slash; by analogy with the
    # other patterns it may have been meant as "//[^/]" - to be confirmed
    # before changing it, as it affects the scoring of cpp, js and css.
    comments = {
        "py": re.compile("#[^#]"),
        "sql": re.compile("--[^-]"),
        "css": re.compile("//[/]"),
        "vb": re.compile("'' "),
        "xml": re.compile("<!--[^-]"),
    }
    comments["cpp"] = comments["js"] = comments["css"]

    token = exp1.findall(code) + exp2.findall(code)

    # counts[a, b] = (hits in favour of a, hits in favour of b)
    counts = {}
    for k, v in keywords.items():
        for k2, v2 in keywords.items():
            if k == k2:
                continue
            # only keywords the two languages do not have in common are informative
            own = sum(1 for t in token if t in v and t not in v2)
            other = sum(1 for t in token if t in v2 and t not in v)
            # comment markers only help when the two languages write comments differently
            if comments[k] != comments[k2]:
                own += len(comments[k].findall(code))
                other += len(comments[k2].findall(code))
            counts[k, k2] = (own, other)

    # number of battles each language wins, a tie counts as a win for both sides
    better = {}
    for (k, _), (own, other) in counts.items():
        if own >= other:
            better[k] = better.get(k, 0) + 1

    li = sorted((v, k) for k, v in better.items())
    if not li:
        return None

    # we look for a language which wins every battle, and it must be the only one
    nb_battles = len(keywords) - 1
    if li[-1][0] != nb_battles:
        return None
    if len(li) > 1 and li[-2][0] >= nb_battles:
        return None

    ans = li[-1][1]
    # The denominator cannot be null here: a 0-0 battle means the opponent sees
    # exactly the same hits against every other language, it would then win
    # every battle as well and the winner would not be unique.
    score = min(own / (own + other)
                for (k, _), (own, other) in counts.items() if k == ans)
    return (ans, score)