
    ge1                     2    d Z ddlZddlmZ  G d de      Zy)a  
ARLSTem Arabic Stemmer
The details about the implementation of this algorithm are described in:
K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer ,
Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
Vol. 29, No. 3, 2017, pp. 557-573.
The ARLSTem is a light Arabic stemmer that is based on removing the affixes
from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
compared to several other stemmers using Paice's parameters (under-stemming
index, over-stemming index and stemming weight), and the results showed that
ARLSTem is promising and producing high performances. This stemmer is not
based on any dictionary and can be used on-line effectively.
    N)StemmerIc                   d    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zy)ARLSTemaY  
    ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
    Department of Telecommunication & Information Processing. USTHB University,
    Algiers, Algeria.
    ARLSTem.stem(token) returns the Arabic stem for the input token.
    The ARLSTem Stemmer requires that all tokens are encoded using Unicode
    encoding.
    c                    t        j                  d      | _        t        j                  d      | _        t        j                  d      | _        g d| _        g d| _        ddg| _        g d| _        d	d
g| _	        ddg| _
        ddg| _        ddg| _        g d| _        ddg| _        ddg| _        ddg| _        ddg| _        g d| _        ddg| _        g d| _        g d| _        y )Nz[\u0622\u0623\u0625]z[\u0649]z[\u064B-\u065F])u   الu   للu   فلu   فب)u   بالu   كالu   والu   فللu   ولل)u   فبالu   وبالu   فكالu   كيu   كمu   هاu   همu   كماu   كنّu   هماu   هنّ)   انu   ين   ونu   تانu   تينr   r   u   ستu   سيu   ساu   سن)u   لنu   لتu   ليu   لأu   تماu   تنّ)   ناu   تمu   تا   وا)   ت   ا   ن)recompilere_hamzated_alifre_alifMaqsurare_diacriticspr2pr3pr32pr4su2su22su3su32pl_si2pl_si3verb_su2verb_pr2	verb_pr22	verb_pr33	verb_suf3	verb_suf2	verb_suf1)selfs    F/var/www/openai/venv/lib/python3.12/site-packages/nltk/stem/arlstem.py__init__zARLSTem.__init__*   s     "

+B C jj5ZZ(:; TU)+?@	
 #N3#^4	(*>?)+?@	 G+-AB (8'8(.9
 /0DE
 8    c                 L   	 |t        d      | j                  |      }| j                  |      }||}| j                  |      }| j	                  |      }|*| j                  |      }||S || j                  |      S |S |S # t         $ r}t        |       Y d}~yd}~ww xY w)zN
        call this function to get the word's stem based on ARLSTem .
        NzUThe word could not be stemmed, because                                  it is empty !)
ValueErrornormprefsuff	plur2singfem2mascverbprint)r$   tokenprepsfmes         r%   stemzARLSTem.stemd   s    	} 0 
 IIe$E))E"CIIe$E&Bz]]5)>I{#yy// L 	 	!HH	s*   A+B .B B B 	B#BB#c                     | j                   j                  d|      }| j                  j                  d|      }| j                  j                  d|      }|j	                  d      rt        |      dkD  r|dd }|S )z
        normalize the word by removing diacritics, replacing hamzated Alif
        with Alif replacing AlifMaqsura with Yaa and removing Waaw at the
        beginning.
         r      يu   و      N)r   subr   r   
startswithlenr$   r1   s     r%   r*   zARLSTem.norm   sv     ""&&r51%%))(E:##''%8 H%#e*q.!"IEr'   c                    t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r*| j
                  D ]  }|j                  |      s|dd c S  yy)z<
        remove prefixes from the words' beginning.
           r:   N         )r>   r   r=   r   r   r   )r$   r1   p3p4p2s        r%   r+   zARLSTem.pref   s     u:>hh##B' 9$  u:>hh##B' 9$  u:>ii##B' 9$   u:>hh##B' 9$  r'   c                    |j                  d      rt        |      dkD  r|dd S t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  |j                  d	      rt        |      dkD  r|dd }|S t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r)| j
                  D ]  }|j                  |      s|dd c S  |j                  d
      rt        |      dkD  r|dd S |S )z6
        remove suffixes from the word's end.
        u   كr:   NrC   rA   u   هr	   )endswithr>   r   r   r   r   )r$   r1   s2s3s       r%   r,   zARLSTem.suff   s?    >>(#E
Q":u:>hh>>"% ":%  u:>hh>>"% ":%  >>(#E
Q#2JELu:>ii>>"% ":%   u:>ii>>"% ":%   >>.)c%j1n":r'   c                 N    |j                  d      rt        |      dkD  r|dd S yy)zR
        transform the word from the feminine form to the masculine form.
        u   ةr:   NrI   )rL   r>   r?   s     r%   r.   zARLSTem.fem2masc   s.     >>(#E
Q": )7#r'   c                    t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r|j                  d      r|dd S t        |      dkD  r$|j	                  d      r|d	   dk(  r|dd	 |dd z   S t        |      dkD  r&|j	                  d      r|d   dk(  r|d
d |d   z   S yyy)zO
        transform the word from the plural form to the singular form.
        rC   NrJ   rA   rK   r:   u   اتr   rD   r;   rI   )r>   r   rL   r   r=   )r$   r1   ps2ps3s       r%   r-   zARLSTem.plur2sing   s	    u:>{{>>#& ":% # u:>{{>>#& ":% # u:>enn^<":u:>e..x8U1X=Q!9uQRy((u:>e..x8U2Y(=R2;r** >S8>r'   c                     | j                  |      }||S | j                  |      }||S | j                  |      }||S | j                  |      }||S | j	                  |      }||S | j                  |      S )z=
        stem the verb prefixes and suffixes or both
        )verb_t1verb_t2verb_t3verb_t4verb_t5verb_t6)r$   r1   vbs      r%   r/   zARLSTem.verb   s     \\% >I\\% >I\\% >I\\% >I\\% >I||E""r'   c                    t        |      dkD  r:|j                  d      r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r:|j                  d      r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  rw|j                  d      rft        |      dkD  r|j                  d      r|dd S |j                  d      r|dd	 S |j                  d      r|dd	 S |j                  d
      r|dd	 S t        |      dkD  r'|j                  d      r|j                  d
      r|dd	 S t        |      dkD  r)|j                  d      r|j                  d
      r|dd	 S yyy)z8
        stem the present prefixes and suffixes
        rA   r   r;   rJ   r9   rC   r   r
   rI   r   N)r>   r=   r   rL   r   r$   r1   rM   s      r%   rT   zARLSTem.verb_t1   so    u:>e..x8kk>>"% 2;& " u:>e..x8mm>>"% 2;& $ u:>e..x85zA~%.."@Qr{"~~h'Qr{"~~h'Qr{"~~h'Qr{"u:>e..x8U^^H=U2;u:>e..x8U^^H=U2; >V8>r'   c                    t        |      dkD  r| j                  D ]9  }|j                  | j                  d         s"|j	                  |      s4|dd c S  |j                  | j                  d         r#|j	                  | j                  d         r|dd S |j                  | j                  d         r#|j	                  | j                  d         r|dd S t        |      dkD  r4|j                  | j                  d         r|j	                  d      r|dd S t        |      dkD  r6|j                  | j                  d         r|j	                  d      r|dd S y	y	y	)
z7
        stem the future prefixes and suffixes
        rB   r   rD   rJ   r;   rA   r   rI   N)r>   r   r=   r   rL   r\   s      r%   rU   zARLSTem.verb_t2  sB    u:>kk##DMM!$45%..:L 2;& "
 a 01ennT[[QR^6TQr{"a 01ennT[[QR^6TQr{" JN  q!12x(2; JN  q!12x(2; ) 3 r'   c                 P   t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  t        |      dkD  r*| j                  D ]  }|j                  |      s|dd c S  yy)z+
        stem the present suffixes
        rA   NrK   rC   rJ   r:   rI   )r>   r!   rL   r"   r#   )r$   r1   r   r   su1s        r%   rV   zARLSTem.verb_t38  s     u:>~~>>#& ":% & u:>~~>>#& ":% & u:>~~>>#& ":% & r'   c                     t        |      dkD  r@| j                  D ]  }|j                  |      s|dd c S  |j                  d      r|dd S yy)z+
        stem the present prefixes
        r:   r;   Nr9   )r>   r#   r=   )r$   r1   pr1s      r%   rW   zARLSTem.verb_t4I  s]     u:>~~##C( 9$ & )QRy  *	 r'   c                     t        |      dkD  rR| j                  D ]  }|j                  |      s|dd c S  | j                  D ]  }|j                  |      s|dd c S  |S )z*
        stem the future prefixes
        rC   rD   N)r>   r   r=   r   )r$   r1   r   s      r%   rX   zARLSTem.verb_t5T  sh     u:>~~##C( 9$ & }}##C( 9$ % r'   c                 t    t        |      dkD  r)| j                  D ]  }|j                  |      s|dd c S  |S )z)
        stem the order prefixes
        rC   rD   N)r>   r    r=   )r$   r1   r   s      r%   rY   zARLSTem.verb_t6a  s>     u:>~~##C( 9$ & r'   N)__name__
__module____qualname____doc__r&   r6   r*   r+   r,   r.   r-   r/   rT   rU   rV   rW   rX   rY    r'   r%   r   r       sP    88t!F$%*:+&#*@<&"	!r'   r   )rg   r   nltk.stem.apir   r   rh   r'   r%   <module>rj      s     
 "Ih Ir'   