驽马十驾 驽马十驾

驽马十驾,功在不舍

目录
【ELK】自定义分词器案例
/  

【ELK】自定义分词器案例

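基本格式如下所示(注意:analysis 属于静态设置,对已存在的索引修改前需要先执行 `POST /my-index/_close` 关闭索引,更新完成后再执行 `POST /my-index/_open` 重新打开)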

PUT /my-index/_settings
{
  "index": {
    "analysis": {
      "analyzer": {
        "customHTMLSnowball": {
          "type": "custom",
          "char_filter": [
            "html_strip"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "snowball"
          ]
        }
      }
    }
  }
}

案例 1

{
  "settings": {
    "refresh_interval": "5s",
    "number_of_shards": 1,
    "number_of_replicas": 1,
    "analysis": {
      "filter": {
        "edge_ngram_filter": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 50
        },
        "pinyin_simple_filter": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_separate_first_letter": false,
          "keep_full_pinyin": false,
          "keep_original": false,
          "limit_first_letter_length": 50,
          "lowercase": true
        },
        "pinyin_full_filter": {
          "type": "pinyin",
          "keep_first_letter": false,
          "keep_separate_first_letter": false,
          "keep_full_pinyin": true,
          "none_chinese_pinyin_tokenize": true,
          "keep_original": false,
          "limit_first_letter_length": 50,
          "lowercase": true
        },
        "t2s_convert": {
          "type": "stconvert",
          "delimiter": ",",
          "convert_type": "t2s"
        }
      },
      "char_filter": {
        "charconvert": {
          "type": "mapping",
          "mappings_path": "char_filter_text.txt"
        }
      },
      "tokenizer": {
        "ik_smart": {
          "type": "ik",
          "use_smart": true
        }
      },
      "analyzer": {
        "ngramIndexAnalyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": ["edge_ngram_filter", "lowercase"],
          "char_filter": ["charconvert"]
        },
        "ngramSearchAnalyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": ["lowercase"],
          "char_filter": ["charconvert"]
        },
        "ikIndexAnalyzer": {
          "type": "custom",
          "tokenizer": "ik",
          "char_filter": ["charconvert"]
        },
        "ikSearchAnalyzer": {
          "type": "custom",
          "tokenizer": "ik",
          "char_filter": ["charconvert"]
        },
        "pinyiSimpleIndexAnalyzer": {
          "tokenizer": "keyword",
          "filter": ["pinyin_simple_filter", "edge_ngram_filter", "lowercase"]
        },
        "pinyiSimpleSearchAnalyzer": {
          "tokenizer": "keyword",
          "filter": ["pinyin_simple_filter", "lowercase"]
        },
        "pinyiFullIndexAnalyzer": {
          "tokenizer": "keyword",
          "filter": ["pinyin_full_filter", "lowercase"]
        },
        "pinyiFullSearchAnalyzer": {
          "tokenizer": "keyword",
          "filter": ["pinyin_full_filter", "lowercase"]
        }
      }
    }
  }
}

案例 2

{
  "settings": {
    "analysis": {
      "analyzer": {
        "myanalyzer": {
          "tokenizer": "mytokenizer"
        }
      },
      "tokenizer": {
        "mytokenizer": {
          "type": "ngram",
          "min_gram": 1,
          "max_gram": 2,
          "token_chars": ["letter", "digit", "whitespace", "punctuation", "symbol"]
        }
      }
    }
  }
}

案例 3(依次配置 filter 过滤器、tokenizer 分词器、analyzer 分析器)

{
  "settings": {
    "analysis": {
      "filter": {
        "edge_ngram_filter": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 50
        },
        "pinyin_simple_filter": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_separate_first_letter": false,
          "keep_full_pinyin": false,
          "keep_original": false,
          "limit_first_letter_length": 50,
          "lowercase": true
        },
        "pinyin_full_filter": {
          "type": "pinyin",
          "keep_first_letter": false,
          "keep_separate_first_letter": false,
          "keep_full_pinyin": true,
          "none_chinese_pinyin_tokenize": true,
          "keep_original": false,
          "limit_first_letter_length": 50,
          "lowercase": true
        }
      },
      "tokenizer": {
        "ik_max_word": {
          "type": "ik_max_word",
          "use_smart": true
        }
      },
      "analyzer": {
        "ngramIndexAnalyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "edge_ngram_filter",
            "lowercase"
          ]
        },
        "ikIndexAnalyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word"
        },
        "pinyiSimpleIndexAnalyzer": {
          "tokenizer": "keyword",
          "filter": [
            "pinyin_simple_filter",
            "edge_ngram_filter",
            "lowercase"
          ]
        },
        "pinyiFullIndexAnalyzer": {
          "tokenizer": "keyword",
          "filter": [
            "pinyin_full_filter",
            "lowercase"
          ]
        }
      }
    }
  }
}

案例 4

下面是我们在实战中综合使用的配置

{
  "index": {
    "number_of_replicas": 1,
    "analysis": {
      "char_filter": {
        "ue_char_filter": {
          "type": "mapping",
          "mappings": [
            "- => ,",
            "— => ,"
          ]
        }
      },
      "tokenizer": {
        "ngram_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 3,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      },
      "filter": {
        "my_pinyin": {
          "type": "pinyin",
          "keep_none_chinese": true,
          "keep_none_chinese_together": true,
          "none_chinese_pinyin_tokenize": false,
          "lowercase": true,
          "trim_whitespace": true,
          "keep_first_letter": true,
          "keep_full_pinyin": false,
          "keep_joined_full_pinyin": true,
          "keep_original": true
        }
      },
      "analyzer": {
        "ue_ik_pinyin_analyzer": {
          "type": "custom",
          "char_filter": [
            "html_strip",
            "ue_char_filter"
          ],
          "tokenizer": "ik_max_word",
          "filter": [
            "my_pinyin"
          ]
        },
        "ue-ngram": {
          "type": "custom",
          "char_filter": [
            "html_strip",
            "ue_char_filter"
          ],
          "tokenizer": "ngram_tokenizer"
        }
      }
    }
  }
}
积土成山,风雨兴焉。积水成渊,蛟龙生焉。