elasticsearch分词器

九转成圣

发布于 2024-04-10 17:16:11

760

发布于 2024-04-10 17:16:11

文章被收录于专栏：csdn

elasticsearch分词器

标准分词器

POST _analyze
{
  "text": "java程序员",
  "analyzer": "standard"
}

{
  "tokens" : [
    {
      "token" : "java",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "程",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "序",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    {
      "token" : "员",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "<IDEOGRAPHIC>",
      "position" : 3
    }
  ]
}

ik分词器

ik_smart模式

POST _analyze
{
  "text": "java程序员",
  "analyzer": "ik_smart"
}

{
  "tokens" : [
    {
      "token" : "java",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "ENGLISH",
      "position" : 0
    },
    {
      "token" : "程序员",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 1
    }
  ]
}

ik_max_word模式

尽可能多的分词/尽可能细粒度的分词

POST _analyze
{
  "text": "java程序员",
  "analyzer": "ik_max_word"
}

{
  "tokens" : [
    {
      "token" : "java",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "ENGLISH",
      "position" : 0
    },
    {
      "token" : "程序员",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "程序",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "员",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_CHAR",
      "position" : 3
    }
  ]
}

IK分词器扩展

扩展字典/扩展屏蔽字典

vim /opt/es/elasticsearch-7.6.1/plugins/elasticsearch-analysis-ik-7.6.1/config/IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
        <comment>IK Analyzer 扩展配置</comment>
        <!--用户可以在这里配置自己的扩展字典 -->
        <entry key="ext_dict">ext.dic</entry>
         <!--用户可以在这里配置自己的扩展停止词字典-->
        <entry key="ext_stopwords">stopword.dic</entry>
        <!--用户可以在这里配置远程扩展字典 -->
        <!-- <entry key="remote_ext_dict">words_location</entry> -->
        <!--用户可以在这里配置远程扩展停止词字典-->
        <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>

在IKAnalyzer.cfg.xml目录下新建ext.dic文件,stopword.dic(已存在)

往ext.dic文件和stopword.dic文件里添加词语,例如往ext.dic里添加传智播客,奥力给,往stopword.dic里添加的,了,吗,嗯

vim /opt/es/elasticsearch-7.6.1/plugins/elasticsearch-analysis-ik-7.6.1/config/ext.dic

传智播客
白嫖
奥力给

重启

扩展分词器前

POST _analyze
{
  "text": "传智播客的课程可以白嫖,奥力给",
  "analyzer": "ik_smart"
}

{
  "tokens" : [
    {
      "token" : "传",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "智",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 1
    },
    {
      "token" : "播",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "CN_CHAR",
      "position" : 2
    },
    {
      "token" : "客",
      "start_offset" : 3,
      "end_offset" : 4,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "的",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "CN_CHAR",
      "position" : 4
    },
    {
      "token" : "课程",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "可以",
      "start_offset" : 7,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 6
    },
    {
      "token" : "白",
      "start_offset" : 9,
      "end_offset" : 10,
      "type" : "CN_CHAR",
      "position" : 7
    },
    {
      "token" : "嫖",
      "start_offset" : 10,
      "end_offset" : 11,
      "type" : "CN_CHAR",
      "position" : 8
    },
    {
      "token" : "奥",
      "start_offset" : 12,
      "end_offset" : 13,
      "type" : "CN_CHAR",
      "position" : 9
    },
    {
      "token" : "力",
      "start_offset" : 13,
      "end_offset" : 14,
      "type" : "CN_CHAR",
      "position" : 10
    },
    {
      "token" : "给",
      "start_offset" : 14,
      "end_offset" : 15,
      "type" : "CN_CHAR",
      "position" : 11
    }
  ]
}

扩展后

{
  "tokens" : [
    {
      "token" : "传智播客",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "课程",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "可以",
      "start_offset" : 7,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "白嫖",
      "start_offset" : 9,
      "end_offset" : 11,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "奥力给",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "CN_WORD",
      "position" : 4
    }
  ]
}

扩展后有了"传智播客",“白嫖”,“奥力给”,没有了"的"

本文参与腾讯云自媒体同步曝光计划，分享自作者个人站点/博客。

原始发表：2024-04-10，如有侵权请联系 cloudcommunity@tencent.com 删除

播客