POST _analyze
{
"text": "java程序员",
"analyzer": "standard"
}
{
"tokens" : [
{
"token" : "java",
"start_offset" : 0,
"end_offset" : 4,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "程",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "序",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "员",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 3
}
]
}
POST _analyze
{
"text": "java程序员",
"analyzer": "ik_smart"
}
{
"tokens" : [
{
"token" : "java",
"start_offset" : 0,
"end_offset" : 4,
"type" : "ENGLISH",
"position" : 0
},
{
"token" : "程序员",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 1
}
]
}
ik_max_word:尽可能细粒度地切分,产生尽可能多的词条
POST _analyze
{
"text": "java程序员",
"analyzer": "ik_max_word"
}
{
"tokens" : [
{
"token" : "java",
"start_offset" : 0,
"end_offset" : 4,
"type" : "ENGLISH",
"position" : 0
},
{
"token" : "程序员",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "程序",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "员",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 3
}
]
}
扩展字典/扩展停止词字典(stopwords)
vim /opt/es/elasticsearch-7.6.1/plugins/elasticsearch-analysis-ik-7.6.1/config/IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict">ext.dic</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">stopword.dic</entry>
<!--用户可以在这里配置远程扩展字典 -->
<!-- <entry key="remote_ext_dict">words_location</entry> -->
<!--用户可以在这里配置远程扩展停止词字典-->
<!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>
在IKAnalyzer.cfg.xml所在目录下新建ext.dic文件,stopword.dic(已存在)
往ext.dic文件和stopword.dic文件里添加词语,例如往ext.dic里添加传智播客、白嫖、奥力给,往stopword.dic里添加的、了、吗、嗯
vim /opt/es/elasticsearch-7.6.1/plugins/elasticsearch-analysis-ik-7.6.1/config/ext.dic
传智播客
白嫖
奥力给
重启 Elasticsearch 使字典配置生效
扩展字典前
POST _analyze
{
"text": "传智播客的课程可以白嫖,奥力给",
"analyzer": "ik_smart"
}
{
"tokens" : [
{
"token" : "传",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "智",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "播",
"start_offset" : 2,
"end_offset" : 3,
"type" : "CN_CHAR",
"position" : 2
},
{
"token" : "客",
"start_offset" : 3,
"end_offset" : 4,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "的",
"start_offset" : 4,
"end_offset" : 5,
"type" : "CN_CHAR",
"position" : 4
},
{
"token" : "课程",
"start_offset" : 5,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "可以",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 6
},
{
"token" : "白",
"start_offset" : 9,
"end_offset" : 10,
"type" : "CN_CHAR",
"position" : 7
},
{
"token" : "嫖",
"start_offset" : 10,
"end_offset" : 11,
"type" : "CN_CHAR",
"position" : 8
},
{
"token" : "奥",
"start_offset" : 12,
"end_offset" : 13,
"type" : "CN_CHAR",
"position" : 9
},
{
"token" : "力",
"start_offset" : 13,
"end_offset" : 14,
"type" : "CN_CHAR",
"position" : 10
},
{
"token" : "给",
"start_offset" : 14,
"end_offset" : 15,
"type" : "CN_CHAR",
"position" : 11
}
]
}
扩展后
{
"tokens" : [
{
"token" : "传智播客",
"start_offset" : 0,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "课程",
"start_offset" : 5,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "可以",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "白嫖",
"start_offset" : 9,
"end_offset" : 11,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "奥力给",
"start_offset" : 12,
"end_offset" : 15,
"type" : "CN_WORD",
"position" : 4
}
]
}
扩展后有了“传智播客”、“白嫖”、“奥力给”,没有了“的”