
    g/                     2    d Z ddlZddlmZ  G d de      Zy)z
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
    N)StemmerIc                   N    e Zd ZdZdZddZddZd Zd Zd Z	d	 Z
d
 Zd Zd Zy)LancasterStemmera/  
    Lancaster Stemmer

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum')     # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear')         # ditto
        'ear'
        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying')      #     letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string')      # ditto
        'string'
        >>> st.stem('meant')       # ditto
        'meant'
        >>> st.stem('cement')      # ditto
        'cem'
        >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
        >>> st_pre.stem('kilometer') # Test Prefix
        'met'
        >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
        >>> st_custom.stem("ness") # Change s to t
        'nest'
    )szai*2.za*1.zbb1.zcity3s.zci2>zcn1t>zdd1.zdei3y>zdeec2ss.zdee1.zde2>zdooh4>ze1>zfeil1v.zfi2>zgni3>zgai3y.zga2>zgg1.zht*2.z	hsiug5ct.zhsi3>zi*1.zi1y>zji1d.zjuf1s.zju1d.zjo1d.zjeh1r.zjrev1t.zjsim2t.zjn1d.zj1s.zlbaifi6.zlbai4y.zlba3>zlbi3.zlib2l>zlc1.zlufi4y.zluf3>zlu2.zlai3>zlau3>zla2>zll1.zmui3.zmu*2.zmsi3>zmm1.znois4j>znoix4ct.znoi3>znai3>zna2>znee0.zne2>znn1.zpihs4>zpp1.zre2>zrae0.zra2.zro2>zru2>zrr1.zrt1>zrei3y>zsei3y>zsis2.zsi2>zssen4>zss0.zsuo3>zsu*2.zs*1>zs0.z	tacilp4y.zta2>ztnem4>ztne3>ztna3>ztpir2b.ztpro2b.ztcud1.ztpmus2.ztpec2iv.ztulo2v.ztsis0.ztsi3>ztt1.zuqi3.zugo1.zvis3j>zvie0.zvi2>zylb1>zyli3y>zylp0.zyl2>zygo1.zyhp1.zymo1.zypo1.zyti3>zyte3>zytl2.zyrtsi5.zyra3>zyro3>zyfi3.zycn2t>zyca3>zzi2>zzy1s.Nc                 V    i | _         || _        |r|| _        y| j                  | _        y)z,Create an instance of the Lancaster stemmer.N)rule_dictionary_strip_prefixdefault_rule_tuple_rule_tuple)self
rule_tuplestrip_prefix_flags      H/var/www/openai/venv/lib/python3.12/site-packages/nltk/stem/lancaster.py__init__zLancasterStemmer.__init__   s+      ".)3:9P9P    c                 .   |r|n| j                   }t        j                  d      }i | _        |D ]d  }|j	                  |      st        d| d      |dd }|| j                  v r| j                  |   j                  |       U|g| j                  |<   f y)a(  Validate the set of rules used in this stemmer.

        If this function is called as an individual method, without using stem
        method, rule_tuple argument will be compiled into self.rule_dictionary.
        If this function is called within stem, self._rule_tuple will be used.

        z^[a-z]+\*?\d[a-z]*[>\.]?$z	The rule z is invalidr      N)r
   recompiler   match
ValueErrorappend)r   r   
valid_rulerulefirst_letters        r   
parseRuleszLancasterStemmer.parseRules   s     $.Z43C3C
ZZ <=
!D##D) 9TF+!>??!9Lt333$$\299$?6:V$$\2 r   c                     |j                         }| j                  r| j                  |      n|}|}| j                  s| j	                          | j                  ||      S )z(Stem a word using the Lancaster stemmer.)lowerr   _LancasterStemmer__stripPrefixr   r   _LancasterStemmer__doStemming)r   wordintact_words      r   stemzLancasterStemmer.stem   sX     zz|+/+=+=t!!$'4  ##OO  {33r   c                 B   t        j                  d      }d}|r| j                  |      }|dk  s||   | j                  vrd}nd}| j                  ||      D ]  }|j	                  |      }|s|j                         \  }	}
}}}t        |      }|j                  |	ddd         sP|
r7||k(  sX| j                  ||      sk| j                  |||      }d}|dk(  rd} n2| j                  ||      s| j                  |||      }d}|dk(  rd} n |dk(  rd}|r|S )z Perform the actual word stemmingz#^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$Tr   FN.)
r   r    _LancasterStemmer__getLastLetterr   r   groupsintendswith_LancasterStemmer__isAcceptable_LancasterStemmer__applyRule)r   r    r!   r   proceedlast_letter_positionrule_was_appliedr   
rule_matchending_stringintact_flagremove_totalappend_string	cont_flags                 r   __doStemmingzLancasterStemmer.__doStemming   sy    ZZ FG
#'#7#7#=  %q(,-T5I5II $)  !006J1KLD!+!1!1$!7J! '--/)'()%
 (+<'8  ==tt)<=*#';#64;N;N$(,<" ,0+;+;(,lM,&D 8<$4'0C'727$)!%!4!4T<!H'+'7'7$(,(" 48 0#,#3.3G %G MJ $u,#Gk l r   c                 l    d}t        t        |            D ]  }||   j                         r|} |S  |S )zHGet the zero-based index of the last alphabetic character in this stringr$   )rangelenisalpha)r   r    last_letterpositions       r   __getLastLetterz LancasterStemmer.__getLastLetter  s@    c$i(HH~%%'& )
 r   c                     d}|d   dv rt        |      |z
  dk\  rd}|S t        |      |z
  dk\  r|d   dv rd}|S |d   dv rd}|S )z1Determine if the word is acceptable for stemming.Fr   aeiouy   T   r   r8   )r   r    r2   word_is_acceptables       r   __isAcceptablezLancasterStemmer.__isAcceptable$  s|    " 7h4y<'1,%)" "! Y%*Aw("%)" "! aH$%)"!!r   c                 :    t        |      |z
  }|d| }|r||z  }|S )z#Apply the stemming rule to the wordr   rA   )r   r    r2   r3   new_word_lengths        r   __applyRulezLancasterStemmer.__applyRule5  s2     d)l2Ao& M!Dr   c                 V    dD ]#  }|j                  |      s|t        |      d c S  |S )zYRemove prefix from a word.

        This function originally taken from Whoosh.

        )	kilomicromilliintraultramegananopicopseudoN)
startswithr8   )r   r    prefixs      r   __stripPrefixzLancasterStemmer.__stripPrefix@  s4    

F v&CKM**

 r   c                      y)Nz<LancasterStemmer> )r   s    r   __repr__zLancasterStemmer.__repr__U  s    #r   )NF)N)__name__
__module____qualname____doc__r	   r   r   r"   r   r&   r*   r+   r   rV   rU   r   r   r   r      sA    !HtlQ<.4=~""	*$r   r   )rZ   r   nltk.stem.apir   r   rU   r   r   <module>r\      s     
 "E$x E$r   