diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index d3dcfc4b24d598..f2db83e47ec727 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -52,6 +52,15 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality High quality: embedding using embedding model, built as vector database index - economy Economy: Build using inverted index of keyword table index + + Format of indexed content + - text_model Text documents are directly embedded; `economy` mode defaults to using this form + - hierarchical_model Parent-child mode + - qa_model Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions + + + In Q&A mode, specify the language of the document, for example: English, Chinese + Processing rules - mode (string) Cleaning, segmentation mode, automatic / custom @@ -65,6 +74,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set. Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc (full-text retrieval) / paragraph (paragraph retrieval) + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed.
The default is *** + - max_tokens The maximum length (tokens), which must be shorter than the length of the parent chunk @@ -155,6 +168,13 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality High quality: embedding using embedding model, built as vector database index - economy Economy: Build using inverted index of keyword table index + - doc_form Format of indexed content + - text_model Text documents are directly embedded; `economy` mode defaults to using this form + - hierarchical_model Parent-child mode + - qa_model Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions + + - doc_language In Q&A mode, specify the language of the document, for example: English, Chinese + - process_rule Processing rules - mode (string) Cleaning, segmentation mode, automatic / custom - rules (object) Custom rules (in automatic mode, this field is empty) @@ -167,6 +187,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set. Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc (full-text retrieval) / paragraph (paragraph retrieval) + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** + - max_tokens The maximum length (tokens), which must be shorter than the length of the parent chunk Files that need to be uploaded. @@ -449,6 +473,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set.
Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc (full-text retrieval) / paragraph (paragraph retrieval) + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** + - max_tokens The maximum length (tokens), which must be shorter than the length of the parent chunk @@ -546,6 +574,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set. Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc (full-text retrieval) / paragraph (paragraph retrieval) + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** + - max_tokens The maximum length (tokens), which must be shorter than the length of the parent chunk @@ -984,7 +1016,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from @@ -1009,6 +1041,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - answer (text) Answer content, passed if the knowledge is in Q&A mode (optional) - keywords (list) Keyword (optional) - enabled (bool) False / true (optional) + - regenerate_child_chunks (bool) Whether to regenerate child chunks (optional) diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index db15ede9fcabf2..24418dea579cfb 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -52,6 +52,15 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality 高质量:使用 embedding
模型进行嵌入,构建为向量数据库索引 - economy 经济:使用 keyword table index 的倒排索引进行构建 + + 索引内容的形式 + - text_model text 文档直接 embedding,经济模式默认为该模式 + - hierarchical_model parent-child 模式 + - qa_model Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding + + + 在 Q&A 模式下,指定文档的语言,例如:English、Chinese + 处理规则 - mode (string) 清洗、分段模式 ,automatic 自动 / custom 自定义 @@ -63,8 +72,12 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - remove_urls_emails 删除 URL、电子邮件地址 - enabled (bool) 是否选中该规则,不传入文档 ID 时代表默认值 - segmentation (object) 分段规则 - - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n + - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n - max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 @@ -155,6 +168,13 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 - economy 经济:使用 keyword table index 的倒排索引进行构建 + - doc_form 索引内容的形式 + - text_model text 文档直接 embedding,经济模式默认为该模式 + - hierarchical_model parent-child 模式 + - qa_model Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding + + - doc_language 在 Q&A 模式下,指定文档的语言,例如:English、Chinese + - process_rule 处理规则 - mode (string) 清洗、分段模式 ,automatic 自动 / custom 自定义 - rules (object) 自定义规则(自动模式下,该字段为空) @@ -167,6 +187,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) 分段规则 - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n - max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 需要上传的文件。 @@ -411,7 +435,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from @@ -449,6 +473,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) 分段规则 - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n -
max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 @@ -508,7 +536,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from @@ -546,6 +574,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) 分段规则 - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n - max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 @@ -1009,6 +1041,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - answer (text) 答案内容,非必填,如果知识库的模式为 Q&A 模式则传值 - keywords (list) 关键字,非必填 - enabled (bool) false/true,非必填 + - regenerate_child_chunks (bool) 是否重新生成子分段,非必填