diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx
index d3dcfc4b24d598..f2db83e47ec727 100644
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -52,6 +52,15 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- high_quality
High quality: embedding using embedding model, built as vector database index
- economy
Economy: Build using inverted index of keyword table index
+
+ Format of indexed content
+ - text_model
Text documents are directly embedded; `economy` mode defaults to using this form
+ - hierarchical_model
Parent-child mode
+ - qa_model
Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
+
+
+ In Q&A mode, specify the language of the document, for example: `English`, `Chinese`
+
Processing rules
- mode
(string) Cleaning, segmentation mode, automatic / custom
@@ -65,6 +74,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- segmentation
(object) Segmentation rules
- separator
Custom segment identifier, currently only allows one delimiter to be set. Default is \n
- max_tokens
Maximum length (token) defaults to 1000
+ - parent_mode
Retrieval mode of parent chunks: `full-doc` (full-text retrieval) / `paragraph` (paragraph retrieval)
+ - subchunk_segmentation
(object) Child chunk rules
+ - separator
Segmentation identifier. Currently, only one delimiter is allowed. The default is `***`
+ - max_tokens
Maximum length in tokens; must be less than the length of the parent chunk
@@ -155,6 +168,13 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- high_quality
High quality: embedding using embedding model, built as vector database index
- economy
Economy: Build using inverted index of keyword table index
+ - doc_form
Format of indexed content
+ - text_model
Text documents are directly embedded; `economy` mode defaults to using this form
+ - hierarchical_model
Parent-child mode
+ - qa_model
Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
+
+ - doc_language
In Q&A mode, specify the language of the document, for example: `English`, `Chinese`
+
- process_rule
Processing rules
- mode
(string) Cleaning, segmentation mode, automatic / custom
- rules
(object) Custom rules (in automatic mode, this field is empty)
@@ -167,6 +187,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- segmentation
(object) Segmentation rules
- separator
Custom segment identifier, currently only allows one delimiter to be set. Default is \n
- max_tokens
Maximum length (token) defaults to 1000
+ - parent_mode
Retrieval mode of parent chunks: `full-doc` (full-text retrieval) / `paragraph` (paragraph retrieval)
+ - subchunk_segmentation
(object) Child chunk rules
+ - separator
Segmentation identifier. Currently, only one delimiter is allowed. The default is `***`
+ - max_tokens
Maximum length in tokens; must be less than the length of the parent chunk
Files that need to be uploaded.
@@ -449,6 +473,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- segmentation
(object) Segmentation rules
- separator
Custom segment identifier, currently only allows one delimiter to be set. Default is \n
- max_tokens
Maximum length (token) defaults to 1000
+ - parent_mode
Retrieval mode of parent chunks: `full-doc` (full-text retrieval) / `paragraph` (paragraph retrieval)
+ - subchunk_segmentation
(object) Child chunk rules
+ - separator
Segmentation identifier. Currently, only one delimiter is allowed. The default is `***`
+ - max_tokens
Maximum length in tokens; must be less than the length of the parent chunk
@@ -546,6 +574,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- segmentation
(object) Segmentation rules
- separator
Custom segment identifier, currently only allows one delimiter to be set. Default is \n
- max_tokens
Maximum length (token) defaults to 1000
+ - parent_mode
Retrieval mode of parent chunks: `full-doc` (full-text retrieval) / `paragraph` (paragraph retrieval)
+ - subchunk_segmentation
(object) Child chunk rules
+ - separator
Segmentation identifier. Currently, only one delimiter is allowed. The default is `***`
+ - max_tokens
Maximum length in tokens; must be less than the length of the parent chunk
@@ -984,7 +1016,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
@@ -1009,6 +1041,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- answer
(text) Answer content, passed if the knowledge is in Q&A mode (optional)
- keywords
(list) Keyword (optional)
- enabled
(bool) False / true (optional)
+ - regenerate_child_chunks
(bool) Whether to regenerate child chunks (optional)
diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx
index db15ede9fcabf2..24418dea579cfb 100644
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -52,6 +52,15 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- high_quality
高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
- economy
经济:使用 keyword table index 的倒排索引进行构建
+
+ 索引内容的形式
+ - text_model
text 文档直接 embedding,经济模式默认为该模式
+ - hierarchical_model
parent-child 模式
+ - qa_model
Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
+
+
+ 在 Q&A 模式下,指定文档的语言,例如:`English`、`Chinese`
+
处理规则
- mode
(string) 清洗、分段模式 ,automatic 自动 / custom 自定义
@@ -63,8 +72,12 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- remove_urls_emails
删除 URL、电子邮件地址
- enabled
(bool) 是否选中该规则,不传入文档 ID 时代表默认值
- segmentation
(object) 分段规则
- - separator
自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n
+ - separator
自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n
- max_tokens
最大长度(token)默认为 1000
+ - parent_mode
父分段的召回模式:`full-doc` 全文召回 / `paragraph` 段落召回
+ - subchunk_segmentation
(object) 子分段规则
+ - separator
分段标识符,目前仅允许设置一个分隔符。默认为 `***`
+ - max_tokens
最大长度(token),必须小于父分段的长度
@@ -155,6 +168,13 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- high_quality
高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
- economy
经济:使用 keyword table index 的倒排索引进行构建
+ - doc_form
索引内容的形式
+ - text_model
text 文档直接 embedding,经济模式默认为该模式
+ - hierarchical_model
parent-child 模式
+ - qa_model
Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
+
+ - doc_language
在 Q&A 模式下,指定文档的语言,例如:`English`、`Chinese`
+
- process_rule
处理规则
- mode
(string) 清洗、分段模式 ,automatic 自动 / custom 自定义
- rules
(object) 自定义规则(自动模式下,该字段为空)
@@ -167,6 +187,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- segmentation
(object) 分段规则
- separator
自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n
- max_tokens
最大长度(token)默认为 1000
+ - parent_mode
父分段的召回模式:`full-doc` 全文召回 / `paragraph` 段落召回
+ - subchunk_segmentation
(object) 子分段规则
+ - separator
分段标识符,目前仅允许设置一个分隔符。默认为 `***`
+ - max_tokens
最大长度(token),必须小于父分段的长度
需要上传的文件。
@@ -411,7 +435,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
@@ -449,6 +473,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- segmentation
(object) 分段规则
- separator
自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n
- max_tokens
最大长度(token)默认为 1000
+ - parent_mode
父分段的召回模式:`full-doc` 全文召回 / `paragraph` 段落召回
+ - subchunk_segmentation
(object) 子分段规则
+ - separator
分段标识符,目前仅允许设置一个分隔符。默认为 `***`
+ - max_tokens
最大长度(token),必须小于父分段的长度
@@ -508,7 +536,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
@@ -546,6 +574,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- segmentation
(object) 分段规则
- separator
自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n
- max_tokens
最大长度(token)默认为 1000
+ - parent_mode
父分段的召回模式:`full-doc` 全文召回 / `paragraph` 段落召回
+ - subchunk_segmentation
(object) 子分段规则
+ - separator
分段标识符,目前仅允许设置一个分隔符。默认为 `***`
+ - max_tokens
最大长度(token),必须小于父分段的长度
@@ -1009,6 +1041,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
- answer
(text) 答案内容,非必填,如果知识库的模式为 Q&A 模式则传值
- keywords
(list) 关键字,非必填
- enabled
(bool) false/true,非必填
+ - regenerate_child_chunks
(bool) 是否重新生成子分段,非必填