Coverage for pyquickhelper/texthelper/text_diff.py: 93%
60 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Freely inspired from
5`Showing Side-by-Side Diffs in Jupyter
6<https://skeptric.com/python-diffs/>`_.
7"""
8import difflib
9import re
10from itertools import zip_longest
11import html
# One or more whitespace characters: separates tokens inside a sentence.
whitespace = re.compile(r'\s+')
# One or more newlines: separates sentences (runs of blank lines collapse).
end_sentence = re.compile(r'\n+')
def _tokenize(s):
    """Split the string *s* into tokens on runs of whitespace.

    Leading or trailing whitespace produces an empty string as the
    first or last token, as with any ``re`` split.
    """
    tokens = whitespace.split(s)
    return tokens
23def _untokenize(ts):
24 '''Join a list of tokens into a string'''
25 return ' '.join(ts)
def _sentencize(s):
    """Split the string *s* into a list of sentences.

    A "sentence" here is the text found between runs of newline
    characters.
    """
    sentences = end_sentence.split(s)
    return sentences
33def _unsentencise(ts):
34 '''Join a list of sentences into a string'''
35 return '. '.join(ts)
38def _html_unsentencise(ts):
39 '''Joing a list of sentences into HTML for display'''
40 return ''.join(f'<p>{t}</p>' for t in ts)
43def _mark_text(text):
44 return f'<span style="color: red;">{text}</span>'
47def _mark_span(text):
48 return [_mark_text(token) for token in text]
51def _mark_span(text):
52 if len(text) > 0:
53 text[0] = '<span style="background: #69E2FB;">' + text[0]
54 text[-1] += '</span>'
55 return text
def _markup_diff(a, b, mark=_mark_span, default_mark=None, isjunk=None):
    """
    Returns a and b with any differences processed by mark.
    Junk is ignored by the differ.

    :param a: first sequence of tokens
    :param b: second sequence of tokens
    :param mark: callable applied to every slice which differs
    :param default_mark: callable applied to every slice which is
        equal in both sequences, identity if None
    :param isjunk: forwarded to ``difflib.SequenceMatcher``
    :return: tuple of two lists, the processed tokens of a and of b
    """
    def _unchanged(chunk):
        return chunk

    keep = _unchanged if default_mark is None else default_mark
    matcher = difflib.SequenceMatcher(
        isjunk=isjunk, a=a, b=b, autojunk=False)
    left, right = [], []
    for op, lo_a, hi_a, lo_b, hi_b in matcher.get_opcodes():
        if op == 'equal':
            transform = keep
        else:
            transform = mark
        left.extend(transform(a[lo_a:hi_a]))
        right.extend(transform(b[lo_b:hi_b]))
    return left, right
def align_seqs(a, b, fill=''):
    """
    Aligns two sequences of strings after comparing them.

    Both sequences are compared with ``difflib.SequenceMatcher``. For
    every opcode, the shorter of the two slices is padded with *fill*,
    so both outputs always have the same length and matching items
    share the same index.

    :param a: first sequence (list, tuple, ...) of hashable items
    :param b: second sequence
    :param fill: value inserted where one side has no counterpart
    :return: tuple of two lists of equal length
    """
    out_a, out_b = [], []
    seqmatcher = difflib.SequenceMatcher(a=a, b=b, autojunk=False)
    for _, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        # Positive delta: the slice of a is longer, so b needs padding;
        # negative delta: the other way around.
        delta = (a1 - a0) - (b1 - b0)
        # list(...) lets tuples and other non-list sequences through:
        # concatenating their slices with the padding list would
        # otherwise raise TypeError.
        out_a += list(a[a0:a1]) + [fill] * max(-delta, 0)
        out_b += list(b[b0:b1]) + [fill] * max(delta, 0)
    return out_a, out_b
88def _html_sidebyside(a, b):
89 out = '<div style="display: grid;grid-template-columns: 1fr 1fr;grid-gap: 20px;">\n'
90 for left, right in zip_longest(a, b, fillvalue=''):
91 out += f'<p><tt>{left}</tt></p>'
92 out += f'<p><tt>{right}</tt></p>\n'
93 out += '</div>'
94 return out
def html_diffs(a, b):
    """
    Compares two strings and renders the
    results as HTML.

    Both strings are HTML-escaped, split into sentences on newlines and
    aligned; each pair of aligned sentences is then compared token by
    token and the differences are highlighted. The two results are
    displayed side by side.

    :param a: first string
    :param b: second string
    :return: HTML string
    """
    escaped_a, escaped_b = html.escape(a), html.escape(b)
    aligned_a, aligned_b = align_seqs(
        _sentencize(escaped_a), _sentencize(escaped_b))

    rows_a, rows_b = [], []
    for line_a, line_b in zip(aligned_a, aligned_b):
        marked_a, marked_b = _markup_diff(
            _tokenize(line_a), _tokenize(line_b))
        rows_a.append(_untokenize(marked_a))
        rows_b.append(_untokenize(marked_b))

    return _html_sidebyside(rows_a, rows_b)