solr使用教程四【面试+工作】

<!-- Finnish: standard tokenizer, lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_fi.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
    <!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- French: elision removal, lower-casing, stop-word removal, light stemming -->
<fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- strips elided articles such as l' (list in contractions_fr.txt) -->
    <filter class="solr.ElisionFilterFactory"
            articles="lang/contractions_fr.txt"
            ignoreCase="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_fr.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.FrenchLightStemFilterFactory"/>
    <!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
    <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
  </analyzer>
</fieldType>

<!-- Irish: elision and hyphenation-prefix removal, Irish lower-casing, stop words, Snowball stemming -->
<fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- strips elided forms such as d' (list in contractions_ga.txt) -->
    <filter class="solr.ElisionFilterFactory"
            articles="lang/contractions_ga.txt"
            ignoreCase="true"/>
    <!-- drops prefixes such as n-; position increments are deliberately
         disabled on this filter, do not "fix" it to true -->
    <filter class="solr.StopFilterFactory"
            words="lang/hyphenations_ga.txt"
            ignoreCase="true"
            enablePositionIncrements="false"/>
    <filter class="solr.IrishLowerCaseFilterFactory"/>
    <!-- regular stop-word removal, position increments enabled -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_ga.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
  </analyzer>
</fieldType>

<!-- Galician: standard tokenizer, lower-casing, stop-word removal, Galician stemming -->
<fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_gl.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.GalicianStemFilterFactory"/>
    <!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Hindi: lower-casing, Indic and Hindi normalization, stop-word removal, Hindi stemming -->
<fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- unifies the Unicode representation of the text -->
    <filter class="solr.IndicNormalizationFilterFactory"/>
    <!-- unifies spelling variants -->
    <filter class="solr.HindiNormalizationFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_hi.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.HindiStemFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Hungarian: standard tokenizer, lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_hu.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
    <!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Armenian: standard tokenizer, lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_hy.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
  </analyzer>
</fieldType>

<!-- Indonesian: standard tokenizer, lower-casing, stop-word removal, Indonesian stemming -->
<fieldType name="text_id" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_id.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <!-- set stemDerivational to false for a less aggressive stemmer
         that handles inflectional suffixes only -->
    <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
  </analyzer>
</fieldType>

<!-- Italian: elision removal, lower-casing, stop-word removal, light stemming -->
<fieldType name="text_it" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- strips elided articles such as l' (list in contractions_it.txt) -->
    <filter class="solr.ElisionFilterFactory"
            articles="lang/contractions_it.txt"
            ignoreCase="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_it.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.ItalianLightStemFilterFactory"/>
    <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
  </analyzer>
</fieldType>

<!-- Japanese, based on morphological analysis (text_cjk is the bigram-based alternative).

     NOTE: to favour precision, switch the query parser's default operator to AND via
     <solrQueryParser defaultOperator="AND"/> further down in this file. Keep OR
     (the default) to favour recall.
-->
<fieldType name="text_ja"
           class="solr.TextField"
           positionIncrementGap="100"
           autoGeneratePhraseQueries="false">
  <analyzer>
    <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer).

         Search mode (the default) segments text in a way that is useful for search:
         a heuristic splits compounds into their parts, and the compound itself is
         kept as a synonym.

         Valid values for the mode attribute:
           normal:   regular segmentation
           search:   segmentation useful for search, with compounds as synonyms (default)
           extended: same as search mode, but unigrams unknown words (experimental)

         Some applications may prefer search mode for indexing and normal mode for
         queries, to reduce recall and prevent parts of compounds from being matched
         and highlighted. Use <analyzer type="index"> and <analyzer type="query"> for
         this, with mode normal on the query side.

         Kuromoji also offers a user dictionary that overrides the statistical model
         with your own entries for segmentation, part-of-speech tags and readings,
         without any need to specify weights. Note that user dictionaries have not
         been subject to extensive testing.

         User dictionary attributes:
           userDictionary:         user dictionary filename
           userDictionaryEncoding: user dictionary encoding (default is UTF-8)

         A sample user dictionary is provided in lang/userdict_ja.txt.

         Punctuation characters are discarded by default; set
         discardPunctuation="false" to keep them.

         See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese
         language support.
    -->
    <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
    <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->

    <!-- Maps inflected verbs and adjectives to their base/dictionary form (辞書形) -->
    <filter class="solr.JapaneseBaseFormFilterFactory"/>

    <!-- Drops tokens carrying the part-of-speech tags listed in stoptags_ja.txt -->
    <filter class="solr.JapanesePartOfSpeechStopFilterFactory"
            tags="lang/stoptags_ja.txt"
            enablePositionIncrements="true"/>

    <!-- Width normalization: full-width romaji to half-width, half-width kana to
         full-width (a subset of Unicode NFKC) -->
    <filter class="solr.CJKWidthFilterFactory"/>

    <!-- Drops common tokens that rarely help search but hurt ranking -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_ja.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>

    <!-- Unifies common katakana spelling variants by removing a trailing long sound
         character (U+30FC) -->
    <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>

    <!-- Lower-cases romaji characters -->
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Latvian: standard tokenizer, lower-casing, stop-word removal, Latvian stemming -->
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_lv.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.LatvianStemFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Dutch: lower-casing, stop-word removal, stemmer overrides, Snowball stemming -->
<fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_nl.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <!-- override dictionary (case-sensitive lookup) runs ahead of the Snowball stemmer -->
    <filter class="solr.StemmerOverrideFilterFactory"
            dictionary="lang/stemdict_nl.txt"
            ignoreCase="false"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
  </analyzer>
</fieldType>

<!-- Norwegian: standard tokenizer, lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_no.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
    <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
    <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Portuguese: standard tokenizer, lower-casing, stop-word removal, light stemming -->
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_pt.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.PortugueseLightStemFilterFactory"/>
    <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
    <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
    <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Romanian: standard tokenizer, lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_ro.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
  </analyzer>
</fieldType>

<!-- Russian: standard tokenizer, lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_ru.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
    <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Swedish: standard tokenizer, lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- the stop-word file is declared as Snowball-formatted -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_sv.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
    <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->
  </analyzer>
</fieldType>

<!-- Thai: standard tokenizer, lower-casing, Thai word filter, stop-word removal (no stemmer) -->
<fieldType name="text_th" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ThaiWordFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_th.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>

<!-- Turkish: Turkish-specific lower-casing, stop-word removal, Snowball stemming -->
<fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.TurkishLowerCaseFilterFactory"/>
    <!-- runs after Turkish lower-casing; ignoreCase is set to false here -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_tr.txt"
            ignoreCase="false"
            enablePositionIncrements="true"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
  </analyzer>
</fieldType>

</types>

</schema>

原文发布于微信公众号 - Java帮帮(javahelp)

原文发表时间:2018-04-07

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录后参与评论

扫码关注云+社区