Coverage for pyquickhelper/texthelper/text_diff.py: 93%

60 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-03 02:21 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Freely inspired from 

5`Showing Side-by-Side Diffs in Jupyter 

6<https://skeptric.com/python-diffs/>`_. 

7""" 

8import difflib 

9import re 

10from itertools import zip_longest 

11import html 

12 

13 

# Token separator: any run of one or more whitespace characters.
whitespace = re.compile(r'\s+')

# Sentence separator: any run of one or more newline characters.
end_sentence = re.compile(r'\n+')

16 

17 

def _tokenize(s):
    """
    Splits a string into tokens.

    The string is cut on every match of the module-level ``whitespace``
    pattern (runs of whitespace characters).

    :param s: string to split
    :return: list of tokens
    """
    tokens = whitespace.split(s)
    return tokens

21 

22 

23def _untokenize(ts): 

24 '''Join a list of tokens into a string''' 

25 return ' '.join(ts) 

26 

27 

def _sentencize(s):
    """
    Splits a string into a list of sentences.

    The string is cut on every match of the module-level ``end_sentence``
    pattern (runs of newline characters).

    :param s: string to split
    :return: list of sentences
    """
    sentences = end_sentence.split(s)
    return sentences

31 

32 

33def _unsentencise(ts): 

34 '''Join a list of sentences into a string''' 

35 return '. '.join(ts) 

36 

37 

38def _html_unsentencise(ts): 

39 '''Joing a list of sentences into HTML for display''' 

40 return ''.join(f'<p>{t}</p>' for t in ts) 

41 

42 

43def _mark_text(text): 

44 return f'<span style="color: red;">{text}</span>' 

45 

46 

47def _mark_span(text): 

48 return [_mark_text(token) for token in text] 

49 

50 

51def _mark_span(text): 

52 if len(text) > 0: 

53 text[0] = '<span style="background: #69E2FB;">' + text[0] 

54 text[-1] += '</span>' 

55 return text 

56 

57 

def _markup_diff(a, b, mark=_mark_span, default_mark=None, isjunk=None):
    """
    Returns *a* and *b* with any differences processed by *mark*.
    Junk is ignored by the differ.

    :param a: first sequence of tokens
    :param b: second sequence of tokens
    :param mark: function applied to every slice of tokens
        which belongs to a non-equal opcode
    :param default_mark: function applied to every slice of tokens
        which is identical on both sides, identity if None
    :param isjunk: forwarded to :class:`difflib.SequenceMatcher`
    :return: tuple of two lists, processed tokens of *a* and of *b*
    """
    if default_mark is None:
        def default_mark(tokens):
            return tokens

    matcher = difflib.SequenceMatcher(
        isjunk=isjunk, a=a, b=b, autojunk=False)
    marked_a = []
    marked_b = []
    for opcode in matcher.get_opcodes():
        tag, lo_a, hi_a, lo_b, hi_b = opcode
        # Only slices that differ go through *mark*.
        process = mark if tag != 'equal' else default_mark
        marked_a.extend(process(a[lo_a:hi_a]))
        marked_b.extend(process(b[lo_b:hi_b]))
    return marked_a, marked_b

73 

74 

def align_seqs(a, b, fill=''):
    """
    Aligns two sequences of strings after comparing them.

    Both sequences are compared with :class:`difflib.SequenceMatcher`.
    For every opcode, the shorter of the two slices is padded with *fill*
    so that both outputs always have the same length and matching items
    sit at the same index.

    :param a: first sequence (list, tuple, ...) of hashable items
    :param b: second sequence (list, tuple, ...) of hashable items
    :param fill: value inserted where one side has no counterpart
    :return: tuple of two lists of equal length
    """
    out_a, out_b = [], []
    seqmatcher = difflib.SequenceMatcher(a=a, b=b, autojunk=False)
    for _, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        # Positive delta: side a is longer, negative: side b is longer.
        delta = (a1 - a0) - (b1 - b0)
        # extend() accepts any iterable, so slices of tuples or other
        # sequence types work, not only slices of lists.
        out_a.extend(a[a0:a1])
        out_a.extend([fill] * max(-delta, 0))
        out_b.extend(b[b0:b1])
        out_b.extend([fill] * max(delta, 0))
    return out_a, out_b

86 

87 

88def _html_sidebyside(a, b): 

89 out = '<div style="display: grid;grid-template-columns: 1fr 1fr;grid-gap: 20px;">\n' 

90 for left, right in zip_longest(a, b, fillvalue=''): 

91 out += f'<p><tt>{left}</tt></p>' 

92 out += f'<p><tt>{right}</tt></p>\n' 

93 out += '</div>' 

94 return out 

95 

96 

def html_diffs(a, b):
    """
    Compares two strings and renders the
    results as HTML.

    Both strings are HTML-escaped, split into sentences and aligned.
    Each pair of aligned sentences is then split into tokens and
    compared, differences are marked, and the result is displayed
    side by side.

    :param a: first string
    :param b: second string
    :return: HTML string
    """
    escaped_a, escaped_b = html.escape(a), html.escape(b)
    sentences_a, sentences_b = align_seqs(
        _sentencize(escaped_a), _sentencize(escaped_b))

    rows_a = []
    rows_b = []
    for left, right in zip(sentences_a, sentences_b):
        marked_left, marked_right = _markup_diff(
            _tokenize(left), _tokenize(right))
        rows_a.append(_untokenize(marked_left))
        rows_b.append(_untokenize(marked_right))

    return _html_sidebyside(rows_a, rows_b)