adding language analyzers (#8591) (#8757)

opensearch-project · Nov 14, 2024 · 03360a6 · 03360a6
1 parent 284965a
commit 03360a6
Show file tree

Hide file tree

Showing 38 changed files with 5,499 additions and 46 deletions.
diff --git a/_analyzers/language-analyzers.md b/_analyzers/language-analyzers.md
diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md
@@ -0,0 +1,182 @@
+---
+layout: default
+title: Arabic
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 10
+---
+
+# Arabic analyzer
+
+The built-in `arabic` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /arabic-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "arabic"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_arabic
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_arabic_analyzer":{
+          "type":"arabic",
+          "stem_exclusion":["تكنولوجيا","سلطة"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Arabic analyzer internals
+
+The `arabic` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - decimal_digit
+  - stop (Arabic)
+  - normalization (Arabic)
+  - keyword
+  - stemmer (Arabic)
+
+## Custom Arabic analyzer
+
+You can create a custom Arabic analyzer using the following command:
+
+```json
+PUT /arabic-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "arabic_stop": {
+          "type": "stop",
+          "stopwords": "_arabic_"
+        },
+        "arabic_stemmer": {
+          "type": "stemmer",
+          "language": "arabic"
+        },
+        "arabic_normalization": {
+          "type": "arabic_normalization"
+        },
+        "decimal_digit": {
+          "type": "decimal_digit"
+        },
+        "arabic_keywords": {
+          "type":       "keyword_marker",
+          "keywords":   [] 
+        }
+      },
+      "analyzer": {
+        "arabic_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "decimal_digit",
+            "arabic_stop",
+            "arabic_normalization",
+            "arabic_keywords",
+            "arabic_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "arabic_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /arabic-index/_analyze
+{
+  "field": "content",
+  "text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "طلاب",
+      "start_offset": 0,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "يدرس",
+      "start_offset": 7,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "جامع",
+      "start_offset": 17,
+      "end_offset": 25,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "عرب",
+      "start_offset": 26,
+      "end_offset": 33,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "ارقامهم",
+      "start_offset": 35,
+      "end_offset": 42,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "123456",
+      "start_offset": 43,
+      "end_offset": 49,
+      "type": "<NUM>",
+      "position": 6
+    }
+  ]
+}
+```
diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md
@@ -0,0 +1,137 @@
+---
+layout: default
+title: Armenian
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 20
+---
+
+# Armenian analyzer
+
+The built-in `armenian` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /armenian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "armenian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_armenian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_armenian_analyzer": {
+          "type": "armenian",
+          "stem_exclusion": ["բարև", "խաղաղություն"] 
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Armenian analyzer internals
+
+The `armenian` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Armenian)
+  - keyword
+  - stemmer (Armenian)
+
+## Custom Armenian analyzer
+
+You can create a custom Armenian analyzer using the following command:
+
+```json
+PUT /armenian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "armenian_stop": {
+          "type": "stop",
+          "stopwords": "_armenian_"
+        },
+        "armenian_stemmer": {
+          "type": "stemmer",
+          "language": "armenian"
+        },
+        "armenian_keywords": {
+          "type":       "keyword_marker",
+          "keywords":   [] 
+        }
+      },
+      "analyzer": {
+        "armenian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "armenian_stop",
+            "armenian_keywords",
+            "armenian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "armenian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+GET index_with_stem_exclusion_armenian_analyzer/_analyze
+{
+  "analyzer": "stem_exclusion_armenian_analyzer",
+  "text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "բարև","start_offset": 0,"end_offset": 4,"type": "<ALPHANUM>","position": 0},
+    {"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "<ALPHANUM>","position": 1},
+    {"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "<ALPHANUM>","position": 3},
+    {"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "<ALPHANUM>","position": 5},
+    {"token": "նոր","start_offset": 45,"end_offset": 48,"type": "<ALPHANUM>","position": 7},
+    {"token": "օր","start_offset": 49,"end_offset": 51,"type": "<ALPHANUM>","position": 8},
+    {"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "<ALPHANUM>","position": 10}
+  ]
+}
+```