// ---------------------------------------------------------------------------
// Length limits used to keep every regex alternative bounded.
// ---------------------------------------------------------------------------
const MAX_HEADING_LENGTH = 7; // max number of heading marker characters
const MAX_HEADING_CONTENT_LENGTH = 200; // max heading text length
const MAX_HEADING_UNDERLINE_LENGTH = 200; // max Setext underline length
const MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100; // max attribute text inside <hN ...>
const MAX_LIST_ITEM_LENGTH = 200; // max list item length
const MAX_NESTED_LIST_ITEMS = 6; // max nested list items per level
const MAX_LIST_INDENT_SPACES = 7; // max indentation of the deepest list level
const MAX_BLOCKQUOTE_LINE_LENGTH = 200; // max block quote line length
const MAX_BLOCKQUOTE_LINES = 15; // max block quote lines
const MAX_CODE_BLOCK_LENGTH = 1500; // max code block length
const MAX_CODE_LANGUAGE_LENGTH = 20; // max length of the fence language tag
const MAX_INDENTED_CODE_LINES = 20; // max indented code lines
const MAX_TABLE_CELL_LENGTH = 200; // max table cell length
const MAX_TABLE_ROWS = 20; // max table rows
const MAX_HTML_TABLE_LENGTH = 2000; // max HTML table length
const MIN_HORIZONTAL_RULE_LENGTH = 3; // min horizontal rule length
const MAX_SENTENCE_LENGTH = 400; // max sentence length
const MAX_QUOTED_TEXT_LENGTH = 300; // max quoted text length
const MAX_PARENTHETICAL_CONTENT_LENGTH = 200; // max parenthetical content length
const MAX_NESTED_PARENTHESES = 5; // max nested parentheses/brackets
const MAX_MATH_INLINE_LENGTH = 100; // max inline math length
const MAX_MATH_BLOCK_LENGTH = 500; // max math block length
const MAX_PARAGRAPH_LENGTH = 1000; // max paragraph length
const MAX_STANDALONE_LINE_LENGTH = 800; // max standalone line length
const MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100; // max HTML tag attribute length
const MAX_HTML_TAG_CONTENT_LENGTH = 1000; // max HTML tag content length
const LOOKAHEAD_RANGE = 100; // characters to scan ahead for a sentence boundary
const MAX_CHUNK_LENGTH = 1000; // max length (UTF-16 code units) of one merged chunk

// ---------------------------------------------------------------------------
// Reusable pattern fragments (regex source strings, not RegExp objects).
// ---------------------------------------------------------------------------
const AVOID_AT_START = `[\\s\\]})>,']`; // characters a match should not start with
const PUNCTUATION = `[.!?…]|\\.{3}|[\\u2026\\u2047-\\u2049]|[\\p{Emoji_Presentation}\\p{Extended_Pictographic}]`; // sentence punctuation
const QUOTE_END = `(?:'(?=\`)|''(?=\`\`))`; // closing quote
const SENTENCE_END = `(?:${PUNCTUATION}(?<!${AVOID_AT_START}(?=${PUNCTUATION}))|${QUOTE_END})(?=\\S|$)`; // sentence end
const SENTENCE_BOUNDARY = `(?:${SENTENCE_END}|(?=[\\r\\n]|$))`; // sentence end or end of line/input
const LOOKAHEAD_PATTERN = `(?:(?!${SENTENCE_END}).){1,${LOOKAHEAD_RANGE}}${SENTENCE_END}`; // scan ahead for a sentence end
const NOT_PUNCTUATION_SPACE = `(?!${PUNCTUATION}\\s)`; // do not start on "punctuation + whitespace"
// Sentence template; the literal text "{MAX_LENGTH}" is a placeholder that
// sentencePattern() substitutes with a concrete limit.
const SENTENCE_PATTERN = `${NOT_PUNCTUATION_SPACE}(?:[^\\r\\n]{1,{MAX_LENGTH}}${SENTENCE_BOUNDARY}|[^\\r\\n]{1,{MAX_LENGTH}}(?=${PUNCTUATION}|${QUOTE_END})(?:${LOOKAHEAD_PATTERN})?)${AVOID_AT_START}*`;

/**
 * Instantiate SENTENCE_PATTERN with a concrete maximum length.
 *
 * @param {number} maxLength - Upper bound substituted for every "{MAX_LENGTH}" placeholder.
 * @returns {string} Regex source for a sentence of at most `maxLength` characters per run.
 */
function sentencePattern(maxLength) {
  return SENTENCE_PATTERN.replace(/\{MAX_LENGTH\}/g, String(maxLength));
}

// Master segmentation regex. Alternatives are tried in the order listed, so the
// order is significant: structured constructs come first and the generic
// sentence fallback comes last.
const regex = new RegExp(
  "(" +
    // Headings (Setext-style, Markdown, and HTML-style, with length constraints)
    `(?:^(?:[#*=-]{1,${MAX_HEADING_LENGTH}}|\\w[^\\r\\n]{0,${MAX_HEADING_CONTENT_LENGTH}}\\r?\\n[-=]{2,${MAX_HEADING_UNDERLINE_LENGTH}}|<h[1-6][^>]{0,${MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}>)[^\\r\\n]{1,${MAX_HEADING_CONTENT_LENGTH}}(?:</h[1-6]>)?(?:\\r?\\n|$))` +
    "|" +
    // Citations such as "[12] ..."
    `(?:\\[[0-9]+\\][^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}})` +
    "|" +
    // List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels)
    `(?:(?:^|\\r?\\n)[ \\t]{0,3}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${sentencePattern(MAX_LIST_ITEM_LENGTH)}` +
    `(?:(?:\\r?\\n[ \\t]{2,5}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${sentencePattern(MAX_LIST_ITEM_LENGTH)}){0,${MAX_NESTED_LIST_ITEMS}}` +
    `(?:\\r?\\n[ \\t]{4,${MAX_LIST_INDENT_SPACES}}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${sentencePattern(MAX_LIST_ITEM_LENGTH)}){0,${MAX_NESTED_LIST_ITEMS}})?)` +
    "|" +
    // Block quotes (including nested quotes and citations, up to three levels)
    `(?:(?:^>(?:>|\\s{2,}){0,2}${sentencePattern(MAX_BLOCKQUOTE_LINE_LENGTH)}\\r?\\n?){1,${MAX_BLOCKQUOTE_LINES}})` +
    "|" +
    // Code blocks (fenced, indented, or HTML pre/code tags)
    `(?:(?:^|\\r?\\n)(?:\`\`\`|~~~)(?:\\w{0,${MAX_CODE_LANGUAGE_LENGTH}})?\\r?\\n[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:\`\`\`|~~~)\\r?\\n?` +
    `|(?:(?:^|\\r?\\n)(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}(?:\\r?\\n(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_INDENTED_CODE_LINES}}\\r?\\n?)` +
    `|(?:<pre>(?:<code>)?[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:</code>)?</pre>))` +
    "|" +
    // Tables (Markdown pipe tables and HTML tables)
    `(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|(?:\\r?\\n\\|[-:]{1,${MAX_TABLE_CELL_LENGTH}}\\|){0,1}(?:\\r?\\n\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|){0,${MAX_TABLE_ROWS}}` +
    `|<table>[\\s\\S]{0,${MAX_HTML_TABLE_LENGTH}}?</table>))` +
    "|" +
    // Horizontal rules (Markdown and HTML hr tag)
    `(?:^(?:[-*_]){${MIN_HORIZONTAL_RULE_LENGTH},}\\s*$|<hr\\s*/?>)` +
    "|" +
    // Standalone lines or phrases (including single-line blocks and HTML elements)
    `(?!${AVOID_AT_START})(?:^(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}>)?${sentencePattern(MAX_STANDALONE_LINE_LENGTH)}(?:</[a-zA-Z]+>)?(?:\\r?\\n|$))` +
    "|" +
    // Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation)
    `(?!${AVOID_AT_START})${sentencePattern(MAX_SENTENCE_LENGTH)}` +
    "|" +
    // Quoted text, parenthetical phrases, or bracketed content
    "(?:" +
    `(?<!\\w)\"\"\"[^\"]{0,${MAX_QUOTED_TEXT_LENGTH}}\"\"\"(?!\\w)` +
    // The closing quote must equal the opening one. A numbered "\1" here would
    // point at the still-open outer group and match the empty string, so a
    // named group is used instead.
    `|(?<!\\w)(?<quote>['\"\`])[^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}\\k<quote>(?!\\w)` +
    `|(?<!\\w)\`[^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}'(?!\\w)` +
    `|(?<!\\w)\`\`[^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}''(?!\\w)` +
    `|\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\)[^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\)` +
    `|\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\][^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\]` +
    `|\\$[^\\r\\n$]{0,${MAX_MATH_INLINE_LENGTH}}\\$` +
    `|\`[^\`\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\`` +
    ")" +
    "|" +
    // Paragraphs
    `(?!${AVOID_AT_START})(?:(?:^|\\r?\\n\\r?\\n)(?:<p>)?${sentencePattern(MAX_PARAGRAPH_LENGTH)}(?:</p>)?(?=\\r?\\n\\r?\\n|$))` +
    "|" +
    // HTML-like tags and their content (including self-closing tags and attributes)
    `(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}(?:>[\\s\\S]{0,${MAX_HTML_TAG_CONTENT_LENGTH}}?</[a-zA-Z]+>|\\s*/>))` +
    "|" +
    // LaTeX-style math expressions (inline and block)
    `(?:(?:\\$\\$[\\s\\S]{0,${MAX_MATH_BLOCK_LENGTH}}?\\$\\$)|(?:\\$[^\\$\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\\$))` +
    "|" +
    // Fallback for any remaining content
    `(?!${AVOID_AT_START})${sentencePattern(MAX_STANDALONE_LINE_LENGTH)}` +
    ")",
  "gmu"
);

/**
 * Split `text` into segments with the master regex and greedily merge
 * consecutive segments into chunks of at most MAX_CHUNK_LENGTH code units.
 * A single segment longer than the limit becomes its own chunk.
 *
 * @param {{text: string}} params - Input object; `text` is the string to split.
 * @returns {{chunks: string[], tokens: *}} The merged chunks and whatever
 *   `countToken(text)` returns.
 */
function main({ text }) {
  // NOTE(review): countToken is not defined in this file; presumably it is a
  // global injected by the host runtime — confirm before running standalone.
  const tokens = countToken(text);

  const chunks = [];
  let currentChunk = '';

  // With the `g` flag, match() returns every full match, or null when none.
  const segments = text.match(regex) ?? [];
  for (const segment of segments) {
    if (currentChunk.length + segment.length <= MAX_CHUNK_LENGTH) {
      currentChunk += segment;
      continue;
    }
    if (currentChunk) {
      chunks.push(currentChunk);
    }
    currentChunk = segment;
  }
  if (currentChunk) {
    chunks.push(currentChunk);
  }

  return { chunks, tokens };
}