3

T'ícÕã@sŠdZddlZddlZddlmZmZddlmZddlm	Z	yddl
Z
dZWnek
rddZYnXe
djƒƒZd	ZGd
d„deƒZdS)z9Chinese search language: includes routine to split words.éN)ÚDictÚList)ÚSearchLanguage)Úget_stemmerTFz
a  and  are  as  at
be  but  by
for
if  in  into  is  it
near  no  not
of  on  or
such
that  the  their  then  there  these  they  this  to
was  will  with
a
/**
 * Porter Stemmer
 */
var Stemmer = function() {

  var step2list = {
    ational: 'ate',
    tional: 'tion',
    enci: 'ence',
    anci: 'ance',
    izer: 'ize',
    bli: 'ble',
    alli: 'al',
    entli: 'ent',
    eli: 'e',
    ousli: 'ous',
    ization: 'ize',
    ation: 'ate',
    ator: 'ate',
    alism: 'al',
    iveness: 'ive',
    fulness: 'ful',
    ousness: 'ous',
    aliti: 'al',
    iviti: 'ive',
    biliti: 'ble',
    logi: 'log'
  };

  var step3list = {
    icate: 'ic',
    ative: '',
    alize: 'al',
    iciti: 'ic',
    ical: 'ic',
    ful: '',
    ness: ''
  };

  var c = "[^aeiou]";          // consonant
  var v = "[aeiouy]";          // vowel
  var C = c + "[^aeiouy]*";    // consonant sequence
  var V = v + "[aeiou]*";      // vowel sequence

  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
  var s_v   = "^(" + C + ")?" + v;                         // vowel in stem

  this.stemWord = function (w) {
    var stem;
    var suffix;
    var firstch;
    var origword = w;

    if (w.length < 3)
      return w;

    var re;
    var re2;
    var re3;
    var re4;

    firstch = w.substr(0,1);
    if (firstch == "y")
      w = firstch.toUpperCase() + w.substr(1);

    // Step 1a
    re = /^(.+?)(ss|i)es$/;
    re2 = /^(.+?)([^s])s$/;

    if (re.test(w))
      w = w.replace(re,"$1$2");
    else if (re2.test(w))
      w = w.replace(re2,"$1$2");

    // Step 1b
    re = /^(.+?)eed$/;
    re2 = /^(.+?)(ed|ing)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      re = new RegExp(mgr0);
      if (re.test(fp[1])) {
        re = /.$/;
        w = w.replace(re,"");
      }
    }
    else if (re2.test(w)) {
      var fp = re2.exec(w);
      stem = fp[1];
      re2 = new RegExp(s_v);
      if (re2.test(stem)) {
        w = stem;
        re2 = /(at|bl|iz)$/;
        re3 = new RegExp("([^aeiouylsz])\\1$");
        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
        if (re2.test(w))
          w = w + "e";
        else if (re3.test(w)) {
          re = /.$/;
          w = w.replace(re,"");
        }
        else if (re4.test(w))
          w = w + "e";
      }
    }

    // Step 1c
    re = /^(.+?)y$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(s_v);
      if (re.test(stem))
        w = stem + "i";
    }

    // Step 2
    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = new RegExp(mgr0);
      if (re.test(stem))
        w = stem + step2list[suffix];
    }

    // Step 3
    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = new RegExp(mgr0);
      if (re.test(stem))
        w = stem + step3list[suffix];
    }

    // Step 4
    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
    re2 = /^(.+?)(s|t)(ion)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(mgr1);
      if (re.test(stem))
        w = stem;
    }
    else if (re2.test(w)) {
      var fp = re2.exec(w);
      stem = fp[1] + fp[2];
      re2 = new RegExp(mgr1);
      if (re2.test(stem))
        w = stem;
    }

    // Step 5
    re = /^(.+?)e$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(mgr1);
      re2 = new RegExp(meq1);
      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
        w = stem;
    }
    re = /ll$/;
    re2 = new RegExp(mgr1);
    if (re.test(w) && re2.test(w)) {
      re = /.$/;
      w = w.replace(re,"");
    }

    // and turn initial Y back to y
    if (firstch == "y")
      w = firstch.toLowerCase() + w.substr(1);
    return w;
  }
}
c@s|eZdZUdZdZdZeZeZ	e
jdƒZgZ
ee
eddœdd„Zeeed	œd
d„Zeedœd
d„Zeedœdd„ZdS)Ú
SearchChinesez'
    Chinese search implementation
    ÚzhÚChinesez
[a-zA-Z0-9_]+N)ÚoptionsÚreturncCs4tr(|jdƒ}|r(tjj|ƒr(tj|ƒtƒ|_dS)NÚdict)	ÚJIEBAÚgetÚosÚpathÚisfileÚjiebaZ
load_userdictrÚstemmer)Úselfr	Z	dict_path©rú2/tmp/pip-build-gk9425m9/sphinx/sphinx/search/zh.pyÚinitãs


zSearchChinese.init)Úinputr
cCs@g}trttj|ƒƒ}dd„|jj|ƒDƒ}|jj|ƒ||S)NcSsg|]}|jƒ‘qSr)Ústrip)Ú.0Ztermrrrú
<listcomp>ñsz'SearchChinese.split.<locals>.<listcomp>)rÚlistrZcut_for_searchÚlatin1_lettersÚfindallÚlatin_termsÚextend)rrÚchineseÚlatin1rrrÚsplitëszSearchChinese.split)Ústemmed_wordr
cCst|ƒdkS)Né)Úlen)rr#rrrÚword_filterõszSearchChinese.word_filter)Úwordr
cCsJ||jko,t|ƒdko,t|jj|jƒƒƒdk}|r:|jƒS|jj|jƒƒS)Né)rr%rÚstemÚlower)rr'Zshould_not_be_stemmedrrrr)øs
zSearchChinese.stem)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚlangZ
language_nameÚjs_porter_stemmerZjs_stemmer_codeÚenglish_stopwordsZ	stopwordsÚreÚcompilerrrÚstrrrr"Úboolr&r)rrrrr×s


r)r.rr2ÚtypingrrZ
sphinx.searchrZsphinx.util.stemmerrrrÚImportErrorÚsetr"r1r0rrrrrÚ<module>s


;