-
Notifications
You must be signed in to change notification settings - Fork 507
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
1 parent
284965a
commit 03360a6
Showing
38 changed files
with
5,499 additions
and
46 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
--- | ||
layout: default | ||
title: Arabic | ||
parent: Language analyzers | ||
grand_parent: Analyzers | ||
nav_order: 10 | ||
--- | ||
|
||
# Arabic analyzer | ||
|
||
The built-in `arabic` analyzer can be applied to a text field using the following command: | ||
|
||
```json | ||
PUT /arabic-index | ||
{ | ||
"mappings": { | ||
"properties": { | ||
"content": { | ||
"type": "text", | ||
"analyzer": "arabic" | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
## Stem exclusion | ||
|
||
You can use `stem_exclusion` with this language analyzer using the following command: | ||
|
||
```json | ||
PUT index_with_stem_exclusion_arabic | ||
{ | ||
"settings": { | ||
"analysis": { | ||
"analyzer": { | ||
"stem_exclusion_arabic_analyzer": { | ||
"type": "arabic", | ||
"stem_exclusion": ["تكنولوجيا", "سلطة"] | ||
} | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
## Arabic analyzer internals | ||
|
||
The `arabic` analyzer is built using the following components: | ||
|
||
- Tokenizer: `standard` | ||
|
||
- Token filters: | ||
- lowercase | ||
- decimal_digit | ||
- stop (Arabic) | ||
- normalization (Arabic) | ||
- keyword | ||
- stemmer (Arabic) | ||
|
||
## Custom Arabic analyzer | ||
|
||
You can create a custom Arabic analyzer using the following command: | ||
|
||
```json | ||
PUT /arabic-index | ||
{ | ||
"settings": { | ||
"analysis": { | ||
"filter": { | ||
"arabic_stop": { | ||
"type": "stop", | ||
"stopwords": "_arabic_" | ||
}, | ||
"arabic_stemmer": { | ||
"type": "stemmer", | ||
"language": "arabic" | ||
}, | ||
"arabic_normalization": { | ||
"type": "arabic_normalization" | ||
}, | ||
"decimal_digit": { | ||
"type": "decimal_digit" | ||
}, | ||
"arabic_keywords": { | ||
"type": "keyword_marker", | ||
"keywords": [] | ||
} | ||
}, | ||
"analyzer": { | ||
"arabic_analyzer": { | ||
"type": "custom", | ||
"tokenizer": "standard", | ||
"filter": [ | ||
"lowercase", | ||
"decimal_digit", | ||
"arabic_stop", | ||
"arabic_normalization", | ||
"arabic_keywords", | ||
"arabic_stemmer" | ||
] | ||
} | ||
} | ||
} | ||
}, | ||
"mappings": { | ||
"properties": { | ||
"content": { | ||
"type": "text", | ||
"analyzer": "arabic_analyzer" | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
## Generated tokens | ||
|
||
Use the following request to examine the tokens generated using the analyzer: | ||
|
||
```json | ||
POST /arabic-index/_analyze | ||
{ | ||
"field": "content", | ||
"text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦." | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
The response contains the generated tokens: | ||
|
||
```json | ||
{ | ||
"tokens": [ | ||
{ | ||
"token": "طلاب", | ||
"start_offset": 0, | ||
"end_offset": 6, | ||
"type": "<ALPHANUM>", | ||
"position": 0 | ||
}, | ||
{ | ||
"token": "يدرس", | ||
"start_offset": 7, | ||
"end_offset": 13, | ||
"type": "<ALPHANUM>", | ||
"position": 1 | ||
}, | ||
{ | ||
"token": "جامع", | ||
"start_offset": 17, | ||
"end_offset": 25, | ||
"type": "<ALPHANUM>", | ||
"position": 3 | ||
}, | ||
{ | ||
"token": "عرب", | ||
"start_offset": 26, | ||
"end_offset": 33, | ||
"type": "<ALPHANUM>", | ||
"position": 4 | ||
}, | ||
{ | ||
"token": "ارقامهم", | ||
"start_offset": 35, | ||
"end_offset": 42, | ||
"type": "<ALPHANUM>", | ||
"position": 5 | ||
}, | ||
{ | ||
"token": "123456", | ||
"start_offset": 43, | ||
"end_offset": 49, | ||
"type": "<NUM>", | ||
"position": 6 | ||
} | ||
] | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
--- | ||
layout: default | ||
title: Armenian | ||
parent: Language analyzers | ||
grand_parent: Analyzers | ||
nav_order: 20 | ||
--- | ||
|
||
# Armenian analyzer | ||
|
||
The built-in `armenian` analyzer can be applied to a text field using the following command: | ||
|
||
```json | ||
PUT /armenian-index | ||
{ | ||
"mappings": { | ||
"properties": { | ||
"content": { | ||
"type": "text", | ||
"analyzer": "armenian" | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
## Stem exclusion | ||
|
||
You can use `stem_exclusion` with this language analyzer using the following command: | ||
|
||
```json | ||
PUT index_with_stem_exclusion_armenian_analyzer | ||
{ | ||
"settings": { | ||
"analysis": { | ||
"analyzer": { | ||
"stem_exclusion_armenian_analyzer": { | ||
"type": "armenian", | ||
"stem_exclusion": ["բարև", "խաղաղություն"] | ||
} | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
## Armenian analyzer internals | ||
|
||
The `armenian` analyzer is built using the following components: | ||
|
||
- Tokenizer: `standard` | ||
|
||
- Token filters: | ||
- lowercase | ||
- stop (Armenian) | ||
- keyword | ||
- stemmer (Armenian) | ||
|
||
## Custom Armenian analyzer | ||
|
||
You can create a custom Armenian analyzer using the following command: | ||
|
||
```json | ||
PUT /armenian-index | ||
{ | ||
"settings": { | ||
"analysis": { | ||
"filter": { | ||
"armenian_stop": { | ||
"type": "stop", | ||
"stopwords": "_armenian_" | ||
}, | ||
"armenian_stemmer": { | ||
"type": "stemmer", | ||
"language": "armenian" | ||
}, | ||
"armenian_keywords": { | ||
"type": "keyword_marker", | ||
"keywords": [] | ||
} | ||
}, | ||
"analyzer": { | ||
"armenian_analyzer": { | ||
"type": "custom", | ||
"tokenizer": "standard", | ||
"filter": [ | ||
"lowercase", | ||
"armenian_stop", | ||
"armenian_keywords", | ||
"armenian_stemmer" | ||
] | ||
} | ||
} | ||
} | ||
}, | ||
"mappings": { | ||
"properties": { | ||
"content": { | ||
"type": "text", | ||
"analyzer": "armenian_analyzer" | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
## Generated tokens | ||
|
||
Use the following request to examine the tokens generated using the analyzer: | ||
|
||
```json | ||
GET index_with_stem_exclusion_armenian_analyzer/_analyze | ||
{ | ||
"analyzer": "stem_exclusion_armenian_analyzer", | ||
"text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել" | ||
} | ||
``` | ||
{% include copy-curl.html %} | ||
|
||
The response contains the generated tokens: | ||
|
||
```json | ||
{ | ||
"tokens": [ | ||
{"token": "բարև","start_offset": 0,"end_offset": 4,"type": "<ALPHANUM>","position": 0}, | ||
{"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "<ALPHANUM>","position": 1}, | ||
{"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "<ALPHANUM>","position": 3}, | ||
{"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "<ALPHANUM>","position": 5}, | ||
{"token": "նոր","start_offset": 45,"end_offset": 48,"type": "<ALPHANUM>","position": 7}, | ||
{"token": "օր","start_offset": 49,"end_offset": 51,"type": "<ALPHANUM>","position": 8}, | ||
{"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "<ALPHANUM>","position": 10} | ||
] | ||
} | ||
``` |
Oops, something went wrong.