close
Skip to content

Commit 29b531f

Browse files
authored
fix(markdown): stop WYSIWYG hard breaks rendering as literal "<br />" (#13076)
The help/FAQ pages (e.g. /help/faq/editing) showed literal `<br />` text because Open Library has two markdown engines that disagree on hard breaks: * Display: OLMarkdown (vendored Python-Markdown) turns every newline into an injected `<br/>` and has no escape syntax for hard breaks. * Editing: the Tiptap WYSIWYG component (tiptap-markdown) serializes a hard break as CommonMark `\` + newline. When editor-saved content is displayed, OLMarkdown appends its `<br/>` right after the editor's trailing `\`; the backslash escapes the `<`, so readers see the literal text `<br />`. Every multi-line paragraph edited via the editor accumulated this (and stray backslashes — see the page history). Fixes both sides: * Editor: a new OLHardBreak node serializes hard breaks as a bare newline (OLMarkdown's dialect) instead of CommonMark `\`. Re-saving a page now also cleans up legacy backslash/`<br>` cruft. * Renderer: LineBreaksPreprocessor strips a lone trailing `\` before injecting `<br/>`, so already-corrupted pages display correctly without a re-save, and no longer glues a `<br/>` onto the line above a link-reference definition. Adds JS round-trip tests and Python render tests; both fail without the fix. Ref: #13074
2 parents 29ef7e0 + 692c77a commit 29b531f

7 files changed

Lines changed: 144 additions & 7 deletions

File tree

‎openlibrary/components/lit/editor-core.js‎

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { Markdown } from 'tiptap-markdown';
88
import Placeholder from '@tiptap/extension-placeholder';
99
import Image from '@tiptap/extension-image';
1010
import { HtmlBlock } from './html-block.js';
11+
import { OLHardBreak } from './hard-break.js';
1112

1213
/**
1314
* Creates a configured Tiptap editor instance.
@@ -29,8 +30,12 @@ export function createEditor({ element, content, placeholder, onUpdate, onTransa
2930
codeBlock: enableCode ? undefined : false,
3031
code: enableCode ? undefined : false,
3132
link: { openOnClick: false, autolink: true },
32-
strike: false
33+
strike: false,
34+
// Replaced with OLHardBreak below so hard breaks serialize to
35+
// OLMarkdown's dialect (a bare newline, not CommonMark "\").
36+
hardBreak: false
3337
}),
38+
OLHardBreak,
3439
Markdown.configure({
3540
breaks: true,
3641
linkify: true
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import { HardBreak } from '@tiptap/extension-hard-break';
2+
3+
/**
4+
* OLHardBreak – a hard-break node that serializes to OLMarkdown's dialect.
5+
*
6+
* The server-side display renderer (OLMarkdown, a vendored Python-Markdown)
7+
* treats every newline inside a paragraph as a `<br>` and has no escape syntax
8+
* for hard breaks. tiptap-markdown's default serializer emits CommonMark's
9+
* `\<newline>`; that trailing backslash makes OLMarkdown escape the `<` of the
10+
* `<br/>` it injects, so readers see the literal text `<br />`.
11+
*
12+
* Emitting a bare newline instead keeps the two renderers in agreement, and
13+
* re-saving a page through the editor cleans up any legacy backslash/`<br>`
14+
* cruft. See internetarchive/openlibrary#13074.
15+
*/
16+
export const OLHardBreak = HardBreak.extend({
17+
addStorage() {
18+
return {
19+
markdown: {
20+
serialize(state, node, parent, index) {
21+
for (let i = index + 1; i < parent.childCount; i++) {
22+
if (parent.child(i).type !== node.type) {
23+
state.write('\n');
24+
return;
25+
}
26+
}
27+
},
28+
parse: {
29+
// handled by markdown-it
30+
},
31+
},
32+
};
33+
},
34+
});

‎openlibrary/core/olmarkdown.py‎

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,28 @@ def run(self, lines):
6464
class LineBreaksPreprocessor(markdown.Preprocessor):
6565
def run(self, lines):
6666
for i in range(len(lines) - 1):
67-
# append <br/> to all lines expect blank lines and the line before blankline.
67+
# Only consider non-blank lines followed by another non-blank line,
68+
# and never touch indented code (tabbed) lines.
69+
if not (lines[i].strip() and lines[i + 1].strip()):
70+
continue
71+
if markdown.RE.regExp["tabbed"].match(lines[i]):
72+
continue
73+
74+
# A lone trailing backslash is CommonMark's hard-break marker, which
75+
# the Tiptap WYSIWYG editor emits between wrapped lines. OLMarkdown
76+
# has no such syntax: left alone the backslash either escapes the "<"
77+
# of the <br /> we append (so the tag renders as the literal text
78+
# "<br />" to readers) or shows as a stray "\". Drop it either way so
79+
# the two renderers agree. See issue #13074.
80+
lines[i] = re.sub(r"(?<!\\)\\$", "", lines[i])
81+
6882
if (
69-
lines[i].strip()
70-
and lines[i + 1].strip()
71-
and not markdown.RE.regExp["tabbed"].match(lines[i])
72-
and not LINK_REFERENCE_RE.match(lines[i])
83+
not LINK_REFERENCE_RE.match(lines[i])
84+
# Don't glue a hard break onto a line that immediately precedes a
85+
# link-reference definition. This runs before REFERENCE_PREPROCESSOR
86+
# strips those definitions, so a <br /> appended here gets orphaned
87+
# inside the reference block and leaks into the page as literal markup.
88+
and not LINK_REFERENCE_RE.match(lines[i + 1])
7389
and not lines[i].lstrip().startswith(">")
7490
):
7591
lines[i] += "<br />"

‎openlibrary/tests/core/test_olmarkdown.py‎

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,63 @@ def p(html):
1818
assert md("a\nb") == p("a<br/>\n b")
1919

2020

21+
def test_olmarkdown_no_br_leak_into_reference_block():
22+
"""Regression: a hard line break must not be glued onto a line that
23+
immediately precedes a markdown link-reference definition.
24+
25+
LineBreaksPreprocessor runs before REFERENCE_PREPROCESSOR strips the
26+
``[id]: url`` definitions, so a ``<br />`` appended to the line above a
27+
reference block gets orphaned inside it and leaks into the rendered page
28+
as literal markup. This was visible at the bottom of /help/faq/editing
29+
(internetarchive/openlibrary#13074), where a wrapped/malformed footnote
30+
block rendered ``...Tortilla<br /> [24]:`` etc.
31+
"""
32+
33+
def md(text):
34+
return OLMarkdown(text).convert().strip()
35+
36+
# Text line immediately followed by a reference definition: no <br>.
37+
assert md("foo\n [1]: https://example.com") == "<p>foo\n</p>"
38+
39+
# A wrapped reference value (continuation line sitting between two
40+
# definitions, carrying the editor's "\" hard-break marker) must come out
41+
# clean: no leaked <br /> and no stray backslash.
42+
body = (
43+
"intro\n\n"
44+
" [23]: https://openlibrary.org/works/OL23185W/Short_Novels\n"
45+
"(Cannery_Row_Of_Mice_and_Men_Tortilla\\\n"
46+
" [24]: https://openlibrary.org/works/OL14876179W/Adventures\n"
47+
)
48+
rendered = md(body)
49+
assert "<br" not in rendered
50+
assert "\\" not in rendered
51+
52+
# Sanity: an ordinary line break between two plain lines is untouched.
53+
assert md("a\nb") == "<p>a<br/>\n b\n</p>"
54+
55+
56+
def test_olmarkdown_commonmark_hard_break_does_not_leak_br():
57+
"""Regression: a CommonMark hard break (trailing ``\\``) must render as a
58+
line break, not the literal text ``<br />``.
59+
60+
The Tiptap WYSIWYG editor serializes hard breaks as ``\\`` + newline. With
61+
no handling, OLMarkdown appends its own ``<br />`` right after the
62+
backslash, the backslash escapes the ``<``, and readers see the literal
63+
string ``<br />`` (internetarchive/openlibrary#13074).
64+
"""
65+
66+
def md(text):
67+
return OLMarkdown(text).convert().strip()
68+
69+
out = md("first line\\\nsecond line")
70+
assert "&lt;br" not in out # no escaped/literal <br /> shown to the reader
71+
assert "\\" not in out # the hard-break backslash is consumed
72+
assert out == "<p>first line<br/>\n second line\n</p>"
73+
74+
# A genuine escaped backslash (``\\``) is preserved, not swallowed.
75+
assert "\\" in md("a\\\\b\nc")
76+
77+
2178
def test_fenced_code_preprocessor():
2279
pre = FencedCodePreprocessor()
2380

‎package-lock.json‎

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎package.json‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
"@playwright/test": "^1.60.0",
4040
"@sentry/browser": "^10.60.0",
4141
"@tiptap/core": "^3.20.4",
42+
"@tiptap/extension-hard-break": "^3.26.0",
4243
"@tiptap/extension-image": "^3.22.1",
4344
"@tiptap/extension-placeholder": "^3.20.4",
4445
"@tiptap/pm": "^3.20.4",

‎tests/unit/js/OLMarkdownEditor.test.js‎

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import StarterKit from '@tiptap/starter-kit';
1313
import { Markdown } from 'tiptap-markdown';
1414
import Image from '@tiptap/extension-image';
1515
import { HtmlBlock } from '../../../openlibrary/components/lit/html-block.js';
16+
import { OLHardBreak } from '../../../openlibrary/components/lit/hard-break.js';
1617

1718
/* ------------------------------------------------------------------ */
1819
/* Helper: create an editor identical to production, roundtrip text */
@@ -30,8 +31,10 @@ function createTestEditor(markdownContent, { enableCode = false } = {}) {
3031
codeBlock: enableCode ? undefined : false,
3132
code: enableCode ? undefined : false,
3233
link: { openOnClick: false, autolink: true },
33-
strike: false
34+
strike: false,
35+
hardBreak: false
3436
}),
37+
OLHardBreak,
3538
Markdown.configure({
3639
breaks: true,
3740
linkify: true
@@ -439,6 +442,26 @@ Visit [the website](https://example.org) for more.
439442
expect(output.length).toBeGreaterThan(input.length * 0.9);
440443
expect(output).toContain('books and reading');
441444
});
445+
446+
/* ---------- hard breaks (OLMarkdown dialect) ---------- */
447+
448+
test('hard break serializes as a bare newline, not CommonMark "\\"', () => {
449+
// OLMarkdown treats every newline as a <br>. Emitting CommonMark's
450+
// "\<newline>" makes the display renderer escape its own injected
451+
// <br/> and show a literal "<br />" to readers. Regression for
452+
// internetarchive/openlibrary#13074.
453+
const output = roundTrip('first line\nsecond line');
454+
expect(output).not.toContain('\\');
455+
expect(output).not.toContain('<br');
456+
expect(normalize(output)).toBe('first line\nsecond line');
457+
});
458+
459+
test('multi-line paragraph round-trips without backslashes', () => {
460+
const input = 'line one\nline two\nline three';
461+
const output = roundTrip(input);
462+
expect(output).not.toContain('\\');
463+
expect(normalize(output)).toBe(input);
464+
});
442465
});
443466

444467
/* ================================================================== */

0 commit comments

Comments
 (0)