Coverage for pyquickhelper/texthelper/text_diff.py

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Freely inspired from

5`Showing Side-by-Side Diffs in Jupyter

6<https://skeptric.com/python-diffs/>`_.

7"""

8import difflib

9import re

10from itertools import zip_longest

11import html

# Token separator: one or more consecutive whitespace characters.
whitespace = re.compile(r'\s+')
# Sentence separator: one or more consecutive line breaks.
end_sentence = re.compile(r'\n+')

def _tokenize(s):
    """
    Cuts a string into tokens, tokens are separated
    by runs of whitespace (module-level pattern ``whitespace``).

    @param      s       string to split
    @return             list of tokens, leading or trailing whitespace
                        yields an empty token at the corresponding end
    """
    tokens = whitespace.split(s)
    return tokens

def _untokenize(ts):
    """
    Builds a string from a list of tokens,
    tokens are separated by a single space.

    @param      ts      iterable of tokens (strings)
    @return             string
    """
    separator = ' '
    return separator.join(ts)

def _sentencize(s):
    """
    Cuts a string into sentences, a sentence ends with one
    or several consecutive line breaks (module-level pattern
    ``end_sentence``).

    @param      s       string to split
    @return             list of sentences
    """
    sentences = end_sentence.split(s)
    return sentences

def _unsentencise(ts):
    """
    Builds a string from a list of sentences,
    sentences are separated by a period followed by a space.

    @param      ts      iterable of sentences (strings)
    @return             string
    """
    separator = '. '
    return separator.join(ts)

def _html_unsentencise(ts):
    """
    Joins a list of sentences into a single string for HTML display.

    @param      ts      iterable of sentences
    @return             string, every sentence formatted and concatenated
                        without any separator
    """
    # NOTE(review): each sentence is inserted as is, no wrapping tag is
    # visible in the format string -- confirm whether markup was intended.
    pieces = [f'{t}' for t in ts]
    return ''.join(pieces)

def _mark_text(text):
    """
    Returns the marked version of one piece of text.

    @param      text    value to mark
    @return             *text* formatted as a string
    """
    # NOTE(review): the format string adds nothing around *text*
    # -- confirm whether highlighting markup was intended here.
    marked = f'{text}'
    return marked

def _mark_tokens(text):
    """
    Marks every token of a sequence individually by calling
    ``_mark_text`` on each of them.

    This function used to be a first definition of ``_mark_span`` which
    was immediately overwritten by the definition below and therefore
    never reachable. It is kept under a distinct name.

    @param      text    iterable of tokens
    @return             list of marked tokens
    """
    return [_mark_text(token) for token in text]


def _mark_span(text):
    """
    Marks a run of tokens as one span: a prefix is added in front of
    the first token and a suffix after the last one.
    This is the default *mark* callable of ``_markup_diff``.

    @param      text    sequence of tokens (strings)
    @return             new list of tokens, the input is left untouched,
                        an empty input gives an empty list
    """
    # Work on a copy so that the caller's sequence is never modified.
    marked = list(text)
    if marked:
        # NOTE(review): both the prefix and the suffix are empty strings,
        # the span is currently returned unchanged -- confirm whether
        # opening/closing markup was intended here.
        marked[0] = '' + marked[0]
        marked[-1] += ''
    return marked

def _markup_diff(a, b, mark=_mark_span, default_mark=None, isjunk=None):
    """
    Compares *a* and *b* and returns both sequences after every
    chunk was processed: differing chunks go through *mark*,
    identical chunks go through *default_mark*.

    @param      a               first sequence of tokens
    @param      b               second sequence of tokens
    @param      mark            callable applied to every chunk whose
                                opcode is not ``'equal'``
    @param      default_mark    callable applied to ``'equal'`` chunks,
                                None to leave them unchanged
    @param      isjunk          forwarded to ``difflib.SequenceMatcher``,
                                junk is ignored by the differ
    @return                     two lists, processed *a*, processed *b*
    """
    def _identity(chunk):
        return chunk

    on_equal = _identity if default_mark is None else default_mark
    matcher = difflib.SequenceMatcher(
        isjunk=isjunk, a=a, b=b, autojunk=False)

    marked_a = []
    marked_b = []
    for opcode, lo_a, hi_a, lo_b, hi_b in matcher.get_opcodes():
        if opcode == 'equal':
            transform = on_equal
        else:
            transform = mark
        marked_a.extend(transform(a[lo_a:hi_a]))
        marked_b.extend(transform(b[lo_b:hi_b]))
    return marked_a, marked_b

def align_seqs(a, b, fill=''):
    """
    Aligns two sequences of strings after comparing them.
    For every chunk reported by ``difflib.SequenceMatcher``, the shorter
    side is padded with *fill* so that both outputs grow by the same
    number of items.

    @param      a       first sequence (list, tuple, ...), the items
                        must be hashable
    @param      b       second sequence, same constraints
    @param      fill    value inserted where one sequence has no
                        counterpart in the other
    @return             two lists of the same length
    """
    out_a, out_b = [], []
    seqmatcher = difflib.SequenceMatcher(a=a, b=b, autojunk=False)
    for _, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        # Positive when the chunk of *a* is longer than the chunk of *b*.
        delta = (a1 - a0) - (b1 - b0)
        # extend() accepts any slice type; concatenating the slice with
        # a list would fail for tuples or strings.
        out_a.extend(a[a0:a1])
        out_a.extend([fill] * max(-delta, 0))
        out_b.extend(b[b0:b1])
        out_b.extend([fill] * max(delta, 0))
    return out_a, out_b

def _html_sidebyside(a, b):
    """
    Renders two lists of HTML fragments side by side
    in a two-column CSS grid.

    @param      a       fragments displayed in the left column
    @param      b       fragments displayed in the right column
    @return             HTML string, one line per pair of fragments,
                        the shorter list is completed with empty strings
    """
    parts = [
        '<div style="display: grid;'
        'grid-template-columns: 1fr 1fr;grid-gap: 20px;">\n'
    ]
    for left, right in zip_longest(a, b, fillvalue=''):
        parts.append(f'<tt>{left}</tt><tt>{right}</tt>\n')
    parts.append('</div>')
    return ''.join(parts)

def html_diffs(a, b):
    """
    Compares two strings and renders the
    results as HTML.

    Both strings are HTML-escaped, split into sentences which are
    aligned with @see fn align_seqs, then every pair of sentences is
    split into tokens and compared. The outcome is displayed side by side.

    @param      a       first string
    @param      b       second string
    @return             HTML string
    """
    escaped_a = html.escape(a)
    escaped_b = html.escape(b)
    sentences_a, sentences_b = align_seqs(
        _sentencize(escaped_a), _sentencize(escaped_b))

    rows_a = []
    rows_b = []
    for left, right in zip(sentences_a, sentences_b):
        marked_left, marked_right = _markup_diff(
            _tokenize(left), _tokenize(right))
        rows_a.append(_untokenize(marked_left))
        rows_b.append(_untokenize(marked_right))

    return _html_sidebyside(rows_a, rows_b)

Coverage for pyquickhelper/texthelper/text_diff.py: 93%

60 statements