前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >solr使用教程四【面试+工作】

solr使用教程四【面试+工作】

作者头像
Java帮帮
发布2018-06-11 14:08:33
4010
发布2018-06-11 14:08:33
举报

<!-- Finnish (text_fi)
     Chain: standard tokenizer, lowercase, stop words (snowball-format list), Snowball stemmer. -->
<fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_fi.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
    <!-- Less aggressive alternative: <filter class="solr.FinnishLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- French (text_fr)
     Chain: standard tokenizer, elision removal, lowercase, stop words (snowball-format list),
     light stemmer. -->
<fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- Strips elided articles such as l' (list in lang/contractions_fr.txt) -->
    <filter class="solr.ElisionFilterFactory"
            ignoreCase="true"
            articles="lang/contractions_fr.txt"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_fr.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.FrenchLightStemFilterFactory"/>
    <!-- Less aggressive alternative: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
    <!-- More aggressive alternative: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
  </analyzer>
</fieldType>

<!-- Irish (text_ga)
     Chain: standard tokenizer, elision removal, hyphenation-prefix removal, Irish lowercase,
     stop words, Snowball stemmer. -->
<fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- Strips elided forms such as d' (list in lang/contractions_ga.txt) -->
    <filter class="solr.ElisionFilterFactory"
            ignoreCase="true"
            articles="lang/contractions_ga.txt"/>
    <!-- Strips prefixes such as n- (list in lang/hyphenations_ga.txt).
         enablePositionIncrements is deliberately "false" on this filter; do not change it. -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/hyphenations_ga.txt"
            enablePositionIncrements="false"/>
    <filter class="solr.IrishLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_ga.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
  </analyzer>
</fieldType>

<!-- Galician (text_gl)
     Chain: standard tokenizer, lowercase, stop words, Galician stemmer. -->
<fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_gl.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.GalicianStemFilterFactory"/>
    <!-- Less aggressive alternative: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Hindi (text_hi)
     Chain: standard tokenizer, lowercase, Indic normalization, Hindi normalization, stop words,
     Hindi stemmer. -->
<fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Normalizes the Unicode representation of the text -->
    <filter class="solr.IndicNormalizationFilterFactory"/>
    <!-- Normalizes spelling variations -->
    <filter class="solr.HindiNormalizationFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_hi.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.HindiStemFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Hungarian (text_hu)
     Chain: standard tokenizer, lowercase, stop words (snowball-format list), Snowball stemmer. -->
<fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_hu.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
    <!-- Less aggressive alternative: <filter class="solr.HungarianLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Armenian (text_hy)
     Chain: standard tokenizer, lowercase, stop words, Snowball stemmer. -->
<fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_hy.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
  </analyzer>
</fieldType>

<!-- Indonesian (text_id)
     Chain: standard tokenizer, lowercase, stop words, Indonesian stemmer. -->
<fieldType name="text_id" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_id.txt"
            enablePositionIncrements="true"/>
    <!-- Set stemDerivational to "false" for a less aggressive stemmer that handles
         inflectional suffixes only. -->
    <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
  </analyzer>
</fieldType>

<!-- Italian (text_it)
     Chain: standard tokenizer, elision removal, lowercase, stop words (snowball-format list),
     light stemmer. -->
<fieldType name="text_it" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- Strips elided articles such as l' (list in lang/contractions_it.txt) -->
    <filter class="solr.ElisionFilterFactory"
            ignoreCase="true"
            articles="lang/contractions_it.txt"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_it.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.ItalianLightStemFilterFactory"/>
    <!-- More aggressive alternative: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
  </analyzer>
</fieldType>

<!-- Japanese (text_ja), tokenized by morphological analysis; text_cjk is the bigram-based
     alternative.

     Precision vs. recall: to favor precision, set the query parser's default operator to AND
     via <solrQueryParser defaultOperator="AND"/> (configured further down in this file).
     Keep OR, the default, to favor recall.
-->
<fieldType name="text_ja"
           class="solr.TextField"
           positionIncrementGap="100"
           autoGeneratePhraseQueries="false">
  <analyzer>
    <!-- Tokenizer: Kuromoji Japanese morphological analyzer (JapaneseTokenizer).

         The "mode" attribute selects the segmentation strategy:
           normal:   regular segmentation
           search:   (default) segmentation geared to search; a heuristic splits compounds
                     into their parts and keeps the compound itself as a synonym
           extended: same as search, but also unigrams unknown words (experimental)

         Some applications may benefit from indexing in search mode and querying in normal
         mode, which reduces recall and prevents parts of compounds from being matched and
         highlighted. To do that, declare separate <analyzer type="index"> and
         <analyzer type="query"> sections and use mode normal in the query analyzer.

         User dictionary: lets you override the statistical model with your own entries for
         segmentation, part-of-speech tags and readings, without specifying weights. Note that
         user dictionaries have not been subject to extensive testing. Attributes:
           userDictionary:         user dictionary filename
           userDictionaryEncoding: user dictionary encoding (default is UTF-8)
         See lang/userdict_ja.txt for a sample user dictionary file.

         Punctuation characters are discarded by default; set discardPunctuation="false" to
         keep them.

         More on Japanese language support:
         http://wiki.apache.org/solr/JapaneseLanguageSupport
    -->
    <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
    <!-- Variant with a user dictionary:
         <tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/> -->

    <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
    <filter class="solr.JapaneseBaseFormFilterFactory"/>

    <!-- Drops tokens carrying the part-of-speech tags listed in lang/stoptags_ja.txt -->
    <filter class="solr.JapanesePartOfSpeechStopFilterFactory"
            tags="lang/stoptags_ja.txt"
            enablePositionIncrements="true"/>

    <!-- Normalizes full-width romaji to half-width and half-width kana to full-width
         (Unicode NFKC subset) -->
    <filter class="solr.CJKWidthFilterFactory"/>

    <!-- Removes common tokens that are typically not useful for search but have a negative
         effect on ranking -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_ja.txt"
            enablePositionIncrements="true"/>

    <!-- Normalizes common katakana spelling variations by removing any last long sound
         character (U+30FC) -->
    <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>

    <!-- Lower-cases romaji characters -->
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Latvian (text_lv)
     Chain: standard tokenizer, lowercase, stop words, Latvian stemmer. -->
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_lv.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.LatvianStemFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Dutch (text_nl)
     Chain: standard tokenizer, lowercase, stop words (snowball-format list), stemmer override
     dictionary, Snowball stemmer. -->
<fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_nl.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <!-- Override dictionary (lang/stemdict_nl.txt) is placed ahead of the Snowball stemmer -->
    <filter class="solr.StemmerOverrideFilterFactory"
            dictionary="lang/stemdict_nl.txt"
            ignoreCase="false"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
  </analyzer>
</fieldType>

<!-- Norwegian (text_no)
     Chain: standard tokenizer, lowercase, stop words (snowball-format list), Snowball stemmer. -->
<fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_no.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
    <!-- Less aggressive alternative: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
    <!-- Singular/plural only: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Portuguese (text_pt)
     Chain: standard tokenizer, lowercase, stop words (snowball-format list), light stemmer. -->
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_pt.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.PortugueseLightStemFilterFactory"/>
    <!-- Less aggressive alternative: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
    <!-- More aggressive alternative: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
    <!-- Most aggressive alternative: <filter class="solr.PortugueseStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Romanian (text_ro)
     Chain: standard tokenizer, lowercase, stop words, Snowball stemmer. -->
<fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_ro.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
  </analyzer>
</fieldType>

<!-- Russian (text_ru)
     Chain: standard tokenizer, lowercase, stop words (snowball-format list), Snowball stemmer. -->
<fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_ru.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
    <!-- Less aggressive alternative: <filter class="solr.RussianLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Swedish (text_sv)
     Chain: standard tokenizer, lowercase, stop words (snowball-format list), Snowball stemmer. -->
<fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_sv.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
    <!-- Less aggressive alternative: <filter class="solr.SwedishLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Thai (text_th)
     Chain: standard tokenizer, lowercase, Thai word filter, stop words. No stemmer is configured. -->
<fieldType name="text_th" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ThaiWordFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_th.txt"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>

<!-- Turkish (text_tr)
     Chain: standard tokenizer, Turkish lowercase, stop words, Snowball stemmer.
     Unlike the other language types here, the stop filter is configured with ignoreCase="false". -->
<fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.TurkishLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="false"
            words="lang/stopwords_tr.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
  </analyzer>
</fieldType>

</types>

</schema>

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2018-04-07,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 Java帮帮 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档