-
-
Notifications
You must be signed in to change notification settings - Fork 35
🐣 add tools for reflowing the transcript into one paragraph per sentence / speaker #510
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,182 @@ | ||
| import { TbHammer } from 'react-icons/tb'; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we have tests for the transformations in this file? :) |
||
| import { IconButton } from '../components/button'; | ||
| import { EditorWithWebsocket } from './automerge_websocket_editor'; | ||
| import { Document, Paragraph } from '../editor/types'; | ||
| import { Popup } from '../components/popup'; | ||
| import { primitiveWithClassname } from '../styled'; | ||
|
|
||
// CSS utility classes applied to every entry of the text-tools popup menu:
// hover highlight (light and dark variant), rounded corners, full-width
// left-aligned block layout with a little padding.
const menuItemButtonClasses = [
  'hover:bg-gray-200 dark:hover:bg-neutral-700',
  'rounded-md',
  'w-full',
  'text-left',
  'px-2',
  'py-1',
  'block',
];

/** A `button` primitive pre-styled as a popup menu entry. */
export const MenuItemButton = primitiveWithClassname('button', menuItemButtonClasses);
|
|
||
/**
 * Merges every run of consecutive paragraphs that share a speaker into the
 * first paragraph of that run. Mutates `doc` in place.
 *
 * @param doc - the document whose `children` (paragraphs) are merged.
 */
function mergeSameSpeakerParagraphs(doc: Document) {
  let index = 0;
  while (index < doc.children.length - 1) {
    const current = doc.children[index];
    const following = doc.children[index + 1];
    if (current.speaker == following.speaker) {
      // Append a deep copy of the following paragraph's tokens, then drop that
      // paragraph. `index` stays put so the next neighbour is checked against
      // the same (now longer) paragraph.
      current.children.push(...JSON.parse(JSON.stringify(following.children)));
      doc.children.splice(index + 1, 1);
    } else {
      index++;
    }
  }
}
|
|
||
// Characters that terminate a sentence when they are the last thing in a token.
const punctuations = ['.', '?', '!'];
// Endings that look like sentence punctuation but do not finish a sentence.
const non_punctuations = ['...'];
// Closing quotes / brackets that may follow the final punctuation mark (`said."`).
const trailingClosers = /["'”’»)\]]+$/;

/**
 * Reports whether a token finishes a sentence.
 *
 * Only the END of the token is inspected (after dropping surrounding
 * whitespace and trailing closing quotes/brackets). Matching anywhere in the
 * token would treat decimals (" 3.5") or host names ("example.com") as
 * sentence ends, and would reject a token merely because it starts with an
 * ellipsis ("...what?").
 *
 * @param text - the text of a single token.
 * @returns `true` if the token ends with `.`, `?` or `!` but not with `...`.
 */
function containsSentenceEnd(text: string) {
  const core = text.trim().replace(trailingClosers, '');
  if (core.length === 0) {
    return false;
  }
  if (non_punctuations.some((np) => core.endsWith(np))) {
    return false;
  }
  return punctuations.some((punct) => core.endsWith(punct));
}
|
|
||
/**
 * Paragraphs with more tokens than this are split further at comma pauses.
 * Used both as the "too long" cut-off and to decide how many splits to aim for
 * (roughly one per this many tokens).
 */
const MAX_PARAGRAPH_TOKENS = 100;
/** Pause required between two sentences to start a new paragraph right after a break. */
const INITIAL_MIN_PAUSE = 2;
/** Factor applied to the required pause for every token added to the current paragraph. */
const MIN_PAUSE_DECAY = 0.95;
/** A comma split is only made once the running paragraph holds more than this many tokens. */
const MIN_TOKENS_BEFORE_COMMA_SPLIT = 3;

/** Returns a copy of `paragraph`'s own properties with an empty token list. */
function emptyCopyOf(paragraph: Paragraph): Paragraph {
  return { ...paragraph, children: [] as Paragraph['children'] };
}

/**
 * Deep-copies a token via a JSON round trip.
 * NOTE(review): a reviewer asked why the round trip is needed (the comment is
 * truncated in the review). Presumably tokens read from the live document
 * cannot be re-inserted by reference — TODO confirm against the editor's
 * document library.
 */
function cloneToken<T>(token: T): T {
  return JSON.parse(JSON.stringify(token));
}

/**
 * Gap between the end of token `i` and the start of token `i + 1` of
 * `paragraph`, or 0 if either timestamp (or the next token) is missing.
 */
function getPause(i: number, paragraph: Paragraph): number {
  const token = paragraph.children[i];
  const nextToken = paragraph.children[i + 1];
  if (nextToken?.start !== undefined && token?.end !== undefined) {
    return nextToken.start - token.end;
  }
  return 0;
}

/**
 * Strategy "one paragraph per sentence": first merge everything that could
 * possibly be merged, and only then break up on sentence boundaries.
 * Mutates `doc` by replacing its paragraph list.
 */
function reflowToSentences(doc: Document): void {
  mergeSameSpeakerParagraphs(doc);

  const newChildren: Paragraph[] = [];
  doc.children.forEach((paragraph) => {
    let currentParagraph = emptyCopyOf(paragraph);
    paragraph.children.forEach((token) => {
      currentParagraph.children.push(cloneToken(token));
      if (containsSentenceEnd(token.text)) {
        newChildren.push(currentParagraph);
        currentParagraph = emptyCopyOf(paragraph);
      }
    });
    // Keep trailing tokens that are not followed by a sentence end.
    if (currentParagraph.children.length > 0) {
      newChildren.push(currentParagraph);
    }
  });
  doc.children = newChildren;
}

/**
 * Breaks a paragraph that is longer than MAX_PARAGRAPH_TOKENS at the comma
 * tokens followed by the longest pauses. Shorter paragraphs are returned
 * unchanged (as a one-element list).
 */
function splitLongParagraph(paragraph: Paragraph): Paragraph[] {
  if (paragraph.children.length <= MAX_PARAGRAPH_TOKENS) {
    return [paragraph];
  }

  // Pauses after every comma token, ascending. The explicit comparator matters:
  // the default sort compares numbers as strings (so 10 sorts before 9).
  const commaPauses = paragraph.children
    .map((token, i) => ({ text: token.text, pause: getPause(i, paragraph) }))
    .filter((token) => token.text.includes(','))
    .map((token) => token.pause)
    .sort((a, b) => a - b);
  if (commaPauses.length === 0) {
    // No comma to split at: keep the paragraph whole.
    return [paragraph];
  }

  // Aim for roughly one split per MAX_PARAGRAPH_TOKENS tokens by using the
  // (wantedSplits + 1)-th largest comma pause as threshold. If there are fewer
  // commas than that, fall back to the smallest pause, i.e. split at every comma.
  const wantedSplits = Math.floor(paragraph.children.length / MAX_PARAGRAPH_TOKENS);
  const threshold = commaPauses[Math.max(0, commaPauses.length - 1 - wantedSplits)];

  const pieces: Paragraph[] = [];
  let currentParagraph = emptyCopyOf(paragraph);
  paragraph.children.forEach((token, i) => {
    currentParagraph.children.push(cloneToken(token));
    if (
      getPause(i, paragraph) >= threshold &&
      token.text.includes(',') &&
      currentParagraph.children.length > MIN_TOKENS_BEFORE_COMMA_SPLIT
    ) {
      pieces.push(currentParagraph);
      currentParagraph = emptyCopyOf(paragraph);
    }
  });
  if (currentParagraph.children.length > 0) {
    pieces.push(currentParagraph);
  }
  return pieces;
}

/**
 * Strategy "smart reflow": split paragraphs at sentence boundaries, but only
 * if there is a pause between the sentences or the paragraph would become too
 * long. Mutates `doc` by replacing its paragraph list.
 *
 * NOTE(review): a reviewer warned that replacing `doc.children` wholesale is
 * likely to break collaborative editing — TODO confirm and, if so, restructure
 * as in-place edits. The same applies to `reflowToSentences`.
 */
function smartReflow(doc: Document): void {
  mergeSameSpeakerParagraphs(doc);

  const newChildren: Paragraph[] = [];
  doc.children.forEach((paragraph) => {
    // The required pause shrinks with every token added since the last break,
    // so the longer the running paragraph gets, the shorter the pause that is
    // enough to end it. This is what keeps paragraphs from growing too long.
    let minPauseBetweenSentences = INITIAL_MIN_PAUSE;
    let currentParagraph = emptyCopyOf(paragraph);
    paragraph.children.forEach((token, i) => {
      currentParagraph.children.push(cloneToken(token));
      minPauseBetweenSentences *= MIN_PAUSE_DECAY;
      if (
        getPause(i, paragraph) >= minPauseBetweenSentences &&
        containsSentenceEnd(token.text)
      ) {
        // Very long paragraphs without usable sentence ends still get broken up.
        newChildren.push(...splitLongParagraph(currentParagraph));
        minPauseBetweenSentences = INITIAL_MIN_PAUSE;
        currentParagraph = emptyCopyOf(paragraph);
      }
    });
    if (currentParagraph.children.length > 0) {
      newChildren.push(...splitLongParagraph(currentParagraph));
    }
  });
  doc.children = newChildren;
}

/**
 * Toolbar popup offering the transcript reflow tools: one paragraph per
 * speaker, one paragraph per sentence, and the pause-based "smart" reflow.
 *
 * NOTE(review): a reviewer asked whether a warning should be shown when these
 * tools are applied to a document in a non-Latin-style language; sentence
 * detection here relies on `.`, `?`, `!` and `,`.
 *
 * @param editor - the editor whose document is rewritten via `editor.update`.
 */
export function TextTools({ editor }: { editor: EditorWithWebsocket }) {
  return (
    <Popup
      button={<IconButton icon={TbHammer} label={'text tools'} />}
      onClick={(e) => {
        e.preventDefault();
      }}
    >
      <MenuItemButton
        onClick={() => {
          editor.update(mergeSameSpeakerParagraphs);
        }}
      >
        Reflow to One Paragraph per Speaker
      </MenuItemButton>

      <MenuItemButton
        onClick={() => {
          editor.update(reflowToSentences);
        }}
      >
        Reflow to One Paragraph per Sentence
      </MenuItemButton>

      <MenuItemButton
        onClick={() => {
          editor.update(smartReflow);
        }}
      >
        Smart Reflow ✨
      </MenuItemButton>
    </Popup>
  );
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ import { Helmet } from 'react-helmet'; | |
| import { ShareModal } from '../editor/share'; | ||
| import { getDocumentWsUrl, useAuthData } from '../utils/auth'; | ||
| import { ExportModal } from '../editor/export'; | ||
| import { TextTools } from '../editor/text_tools'; | ||
|
|
||
| const LazyDebugPanel = lazy(() => | ||
| import('../editor/debug_panel').then((module) => ({ default: module.DebugPanel })), | ||
|
|
@@ -163,6 +164,7 @@ export function DocumentPage({ | |
| )} | ||
| </TopBarPart> | ||
| <TopBarPart> | ||
| {editor && <TextTools editor={editor} />} | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be gated on |
||
| {data?.has_full_access && ( | ||
| <IconButton | ||
| icon={TbShare3} | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.