From c7a9403e3b9a445ec5ebf2a140132daf4f63326e Mon Sep 17 00:00:00 2001 From: Jaro Habiger Date: Thu, 8 Jan 2026 00:50:09 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=A3=20add=20tools=20for=20reflowin?= =?UTF-8?q?g=20the=20transcript?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit into one paragraph per sentence / speaker --- .../src/editor/automerge_websocket_editor.ts | 24 +++- frontend/src/editor/text_tools.tsx | 112 ++++++++++++++++++ frontend/src/pages/document.tsx | 2 + 3 files changed, 135 insertions(+), 3 deletions(-) create mode 100644 frontend/src/editor/text_tools.tsx diff --git a/frontend/src/editor/automerge_websocket_editor.ts b/frontend/src/editor/automerge_websocket_editor.ts index f06b66c8..6a963da9 100644 --- a/frontend/src/editor/automerge_websocket_editor.ts +++ b/frontend/src/editor/automerge_websocket_editor.ts @@ -16,14 +16,18 @@ enum MessageSyncType { FullDoc = 3, } +export type EditorWithWebsocket = Editor & { + update: (changeFn: (doc: Document) => void) => void; +}; + export function useAutomergeWebsocketEditor( url: string, { onInitialSyncComplete }: { onInitialSyncComplete: (editor?: Editor) => void }, -): [Editor?, Paragraph[]?] { +): [EditorWithWebsocket?, Paragraph[]?] 
{ const debug = useDebugMode(); const sentChanges = useRef>(new Set()); const [editorAndInitialValue, setEditorAndInitialValue] = useState(null); const editorRef = useRef(); @@ -54,7 +58,9 @@ export function useAutomergeWebsocketEditor( const createNewEditor = (doc: Automerge.Doc) => { const baseEditor = createEditor(); const editorWithReact = withReact(baseEditor); - const editor = withHistory(withAutomergeDoc(editorWithReact, Automerge.init())); + const editor = withHistory( + withAutomergeDoc(editorWithReact, Automerge.init()), + ) as EditorWithWebsocket; editor.addDocChangeListener(sendDocChange); const migratedDoc = migrateDocument(doc as Automerge.Doc); @@ -68,6 +74,18 @@ export function useAutomergeWebsocketEditor( migratedDoc.children !== undefined ? JSON.parse(JSON.stringify(migratedDoc.children)) : []; + + editor.update = (changeFn: (doc: Document) => void) => { + console.time('changeFn'); + const changed = Automerge.change(editor.doc, changeFn); + console.timeEnd('changeFn'); + console.time('setDoc'); + editor.setDoc(changed); + console.timeEnd('setDoc'); + console.time('sendDocChange'); + sendDocChange(changed); + console.timeEnd('sendDocChange'); + }; return { editor: editor, initialValue: initialValue }; }); }; diff --git a/frontend/src/editor/text_tools.tsx b/frontend/src/editor/text_tools.tsx new file mode 100644 index 00000000..3db54dd1 --- /dev/null +++ b/frontend/src/editor/text_tools.tsx @@ -0,0 +1,112 @@ +import { TbHammer } from 'react-icons/tb'; +import { IconButton } from '../components/button'; +import { EditorWithWebsocket } from './automerge_websocket_editor'; +import { Document, Paragraph } from '../editor/types'; +import { Popup } from '../components/popup'; +import { primitiveWithClassname } from '../styled'; + +export const MenuItemButton = primitiveWithClassname('button', [ + 'hover:bg-gray-200 dark:hover:bg-neutral-700', + 'rounded-md', + 'w-full', + 'text-left', + 'px-2', + 'py-1', + 'block', +]); + +export function TextTools({ 
editor }: { editor: EditorWithWebsocket }) { + return ( + } + onClick={(e) => { + e.preventDefault(); + }} + > + { + const mergePoints: number[] = []; + for (let i = 0; i < editor.doc.children.length - 1; i++) { + const paragraph = editor.doc.children[i]; + const nextParagraph = editor.doc.children[i + 1]; + if (paragraph.speaker == nextParagraph.speaker) { + mergePoints.push(i); + } + } + editor.update((doc: Document) => { + let removed = 0; + mergePoints.forEach((index) => { + const i = index - removed; + doc.children[i].children.push( + ...JSON.parse(JSON.stringify(doc.children[i + 1].children)), + ); + doc.children.splice(i + 1, 1); + removed++; + }); + }); + }} + > + Reflow to One Paragraph per Speaker + + + { + const punctuations = ['.', '?', '!']; + const non_punctuations = ['...']; + const contains_punctuation = (text: string) => + punctuations.some((punct) => text.includes(punct)) && + !non_punctuations.some((np) => text.includes(np)); + + // stategy: we first merge everything that could possibly be merged... 
+ const mergePoints: number[] = []; + for (let i = 0; i < editor.doc.children.length - 1; i++) { + const paragraph = editor.doc.children[i]; + const nextParagraph = editor.doc.children[i + 1]; + if ( + !contains_punctuation(paragraph.children[paragraph.children.length - 1].text) && + paragraph.speaker == nextParagraph.speaker + ) { + mergePoints.push(i); + } + } + editor.update((doc: Document) => { + let removed = 0; + mergePoints.forEach((index) => { + const i = index - removed; + doc.children[i].children.push( + ...JSON.parse(JSON.stringify(doc.children[i + 1].children)), + ); + doc.children.splice(i + 1, 1); + removed++; + }); + + // ...and only then break up + const newChildren: Paragraph[] = []; + doc.children.forEach((paragraph) => { + let currentParagraph = { + ...paragraph, + children: [] as { text: string }[], + }; + paragraph.children.forEach((token) => { + currentParagraph.children.push(JSON.parse(JSON.stringify(token))); + if (contains_punctuation(token.text)) { + newChildren.push(currentParagraph); + currentParagraph = { + ...paragraph, + children: [], + }; + } + }); + if (currentParagraph.children.length > 0) { + newChildren.push(currentParagraph); + } + }); + doc.children = newChildren; + }); + }} + > + Reflow to One Paragraph per Sentence + + + ); +} diff --git a/frontend/src/pages/document.tsx b/frontend/src/pages/document.tsx index 6252f207..0d8e8119 100644 --- a/frontend/src/pages/document.tsx +++ b/frontend/src/pages/document.tsx @@ -18,6 +18,7 @@ import { Helmet } from 'react-helmet'; import { ShareModal } from '../editor/share'; import { getDocumentWsUrl, useAuthData } from '../utils/auth'; import { ExportModal } from '../editor/export'; +import { TextTools } from '../editor/text_tools'; const LazyDebugPanel = lazy(() => import('../editor/debug_panel').then((module) => ({ default: module.DebugPanel })), @@ -163,6 +164,7 @@ export function DocumentPage({ )} + {editor && } {data?.has_full_access && ( Date: Thu, 8 Jan 2026 01:42:56 +0100 Subject: 
[PATCH 2/2] =?UTF-8?q?=E2=9C=A8=20add=20smart=20reflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/editor/text_tools.tsx | 166 ++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 48 deletions(-) diff --git a/frontend/src/editor/text_tools.tsx b/frontend/src/editor/text_tools.tsx index 3db54dd1..8ec83a91 100644 --- a/frontend/src/editor/text_tools.tsx +++ b/frontend/src/editor/text_tools.tsx @@ -15,6 +15,33 @@ export const MenuItemButton = primitiveWithClassname('button', [ 'block', ]); +function mergeSameSpeakerParagraphs(doc: Document) { + const mergePoints: number[] = []; + for (let i = 0; i < doc.children.length - 1; i++) { + const paragraph = doc.children[i]; + const nextParagraph = doc.children[i + 1]; + if (paragraph.speaker == nextParagraph.speaker) { + mergePoints.push(i); + } + } + let removed = 0; + mergePoints.forEach((index) => { + const i = index - removed; + doc.children[i].children.push(...JSON.parse(JSON.stringify(doc.children[i + 1].children))); + doc.children.splice(i + 1, 1); + removed++; + }); +} + +const punctuations = ['.', '?', '!']; +const non_punctuations = ['...']; +function containsSentenceEnd(text: string) { + return ( + punctuations.some((punct) => text.includes(punct)) && + !non_punctuations.some((np) => text.includes(np)) + ); +} + export function TextTools({ editor }: { editor: EditorWithWebsocket }) { return ( { - const mergePoints: number[] = []; - for (let i = 0; i < editor.doc.children.length - 1; i++) { - const paragraph = editor.doc.children[i]; - const nextParagraph = editor.doc.children[i + 1]; - if (paragraph.speaker == nextParagraph.speaker) { - mergePoints.push(i); - } - } - editor.update((doc: Document) => { - let removed = 0; - mergePoints.forEach((index) => { - const i = index - removed; - doc.children[i].children.push( - ...JSON.parse(JSON.stringify(doc.children[i + 1].children)), - ); - doc.children.splice(i + 1, 1); - removed++; - }); - }); 
+ editor.update(mergeSameSpeakerParagraphs); }} > Reflow to One Paragraph per Speaker @@ -51,36 +60,11 @@ export function TextTools({ editor }: { editor: EditorWithWebsocket }) { { - const punctuations = ['.', '?', '!']; - const non_punctuations = ['...']; - const contains_punctuation = (text: string) => - punctuations.some((punct) => text.includes(punct)) && - !non_punctuations.some((np) => text.includes(np)); - - // stategy: we first merge everything that could possibly be merged... - const mergePoints: number[] = []; - for (let i = 0; i < editor.doc.children.length - 1; i++) { - const paragraph = editor.doc.children[i]; - const nextParagraph = editor.doc.children[i + 1]; - if ( - !contains_punctuation(paragraph.children[paragraph.children.length - 1].text) && - paragraph.speaker == nextParagraph.speaker - ) { - mergePoints.push(i); - } - } editor.update((doc: Document) => { - let removed = 0; - mergePoints.forEach((index) => { - const i = index - removed; - doc.children[i].children.push( - ...JSON.parse(JSON.stringify(doc.children[i + 1].children)), - ); - doc.children.splice(i + 1, 1); - removed++; - }); + // strategy: we first merge everything that could possibly be merged... 
+ mergeSameSpeakerParagraphs(doc); - // ...and only then break up + // ...and only then break up on sentence boundaries const newChildren: Paragraph[] = []; doc.children.forEach((paragraph) => { let currentParagraph = { @@ -89,7 +73,7 @@ export function TextTools({ editor }: { editor: EditorWithWebsocket }) { }; paragraph.children.forEach((token) => { currentParagraph.children.push(JSON.parse(JSON.stringify(token))); - if (contains_punctuation(token.text)) { + if (containsSentenceEnd(token.text)) { newChildren.push(currentParagraph); currentParagraph = { ...paragraph, @@ -107,6 +91,92 @@ export function TextTools({ editor }: { editor: EditorWithWebsocket }) { > Reflow to One Paragraph per Sentence + + { + // this strategy tries to split paragraphs at sentence boundaries, but only if there is a pause between the sentences + // or the paragraphs would become too long. + const initial = 2; + const decay = 0.95; + + const getPause = (i: number, paragraph: Paragraph) => { + const token = paragraph.children[i]; + const nextToken = paragraph.children[i + 1]; + if (nextToken?.start !== undefined && token?.end !== undefined) { + return nextToken.start - token.end; + } + return 0; + }; + + editor.update((doc: Document) => { + mergeSameSpeakerParagraphs(doc); + const newChildren: Paragraph[] = []; + const addNewChild = (paragraph: Paragraph) => { + // if the paragraph is very long and does not contain any sentence ends, we still want to break it up + if (paragraph.children.length <= 100) { + newChildren.push(paragraph); + } else { + const silences = paragraph.children + .map((x, i) => ({ ...x, pause: getPause(i, paragraph) })) + .filter((token) => token.text.includes(',')) + .map((token) => token.pause); + silences.sort((a, b) => a - b); // numeric ascending; the default sort() compares elements as strings + const thresholdIndex = Math.floor(paragraph.children.length / 100); // aim for paragraphs of max ~50 tokens + const threshold = silences[silences.length - 1 - thresholdIndex]; + let currentParagraph = { + ...paragraph, + children: [] as { text: string 
}[], + }; + paragraph.children.forEach((token, i) => { + currentParagraph.children.push(JSON.parse(JSON.stringify(token))); + if ( + getPause(i, paragraph) >= threshold && + token.text.includes(',') && + currentParagraph.children.length > 3 + ) { + newChildren.push(currentParagraph); + currentParagraph = { + ...paragraph, + children: [], + }; + } + }); + if (currentParagraph.children.length > 0) { + newChildren.push(currentParagraph); + } + } + }; + doc.children.forEach((paragraph) => { + let minPauseBetweenSentences = initial; // this gets reduced with every additional token + let currentParagraph = { + ...paragraph, + children: [] as { text: string }[], + }; + paragraph.children.forEach((token, i) => { + currentParagraph.children.push(JSON.parse(JSON.stringify(token))); + minPauseBetweenSentences *= decay; + if ( + getPause(i, paragraph) >= minPauseBetweenSentences && + containsSentenceEnd(token.text) + ) { + addNewChild(currentParagraph); + minPauseBetweenSentences = initial; + currentParagraph = { + ...paragraph, + children: [], + }; + } + }); + if (currentParagraph.children.length > 0) { + addNewChild(currentParagraph); + } + }); + doc.children = newChildren; + }); + }} + > + Smart Reflow ✨ + ); }