
    gy                     R    d Z ddlZddlZddlmZ ddlmZ ddlmZ  G d de      Z	y)a  
The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
by Otto Jesperson in 1904. The sonorous quality of a phoneme is judged by the
openness of the lips. Syllable breaks occur before troughs in sonority. For more
on the SSP see Selkirk (1984).

The default implementation uses the English alphabet, but the `sonority_hiearchy`
can be modified to IPA or any other alphabet for the use-case. The SSP is a
universal syllabification algorithm, but that does not mean it performs equally
across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
if utilizing IPA (pg. 311).

Importantly, if a custom hierarchy is supplied and vowels span across more than
one level, they should be given separately to the `vowels` class attribute.

References:

- Otto Jespersen. 1904. Lehrbuch der Phonetik.
  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  Cambridge, MIT Press. pp. 107-136.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  In HLT-NAACL. pp. 308-316.
    N)punctuation)
TokenizerI)ngramsc                   *    e Zd ZdZddZd Zd Zd Zy)SyllableTokenizera  
    Syllabifies words based on the Sonority Sequencing Principle (SSP).

        >>> from nltk.tokenize import SyllableTokenizer
        >>> from nltk import word_tokenize
        >>> SSP = SyllableTokenizer()
        >>> SSP.tokenize('justification')
        ['jus', 'ti', 'fi', 'ca', 'tion']
        >>> text = "This is a foobar-like sentence."
        >>> [SSP.tokenize(token) for token in word_tokenize(text)]
        [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
    c                     |s	|dk(  rg d}|d   | _         i | _        t        |      D ]F  \  }}|D ]<  }t        |      |z
  }|| j                  |<   || j                  |j	                         <   > H y)a  
        :param lang: Language parameter, default is English, 'en'
        :type lang: str
        :param sonority_hierarchy: Sonority hierarchy according to the
                                   Sonority Sequencing Principle.
        :type sonority_hierarchy: list(str)
        en)aeiouylmnrwzvsfbcdgtkpqxhjr   N)vowelsphoneme_map	enumeratelenupper)selflangsonority_hierarchyilevelcsonority_levels          V/var/www/openai/venv/lib/python3.12/site-packages/nltk/tokenize/sonority_sequencing.py__init__zSyllableTokenizer.__init__9   s     "ddl" )+!"45HAu!$%7!81!<&4  #.<  +  6    c           	         g }|D ]#  }	 |j                  || j                  |   f       % |S # t        $ r |dvr|t        vr|t	        j
                  dj                  |             |j                  |t        | j                  j                               f       || j                  vr)| xj                  |z  c_	        n|j                  |df       Y w xY w)af  
        Assigns each phoneme its value from the sonority hierarchy.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return: List of tuples, first element is character/phoneme and
                 second is the soronity value.
        :rtype: list(tuple(str, int))
        
0123456789zECharacter not defined in sonority_hierarchy, assigning as vowel: '{}')
appendr   KeyErrorr   warningswarnformatmaxvaluesr   )r   tokensyllables_valuesr   s       r   assign_valueszSyllableTokenizer.assign_valuesU   s     A5 ''D,<,<Q,?(@A     
5L(Qk-AMM44:F1I %++QD4D4D4K4K4M0N,OP+q($++QG4
5s    -B$CCc                    g }d}t        j                  dj                  | j                              }t	        |      D ]  \  }}|t
        v r|j                  |        |j                  |      s$t        |      dk(  r||z  }E|dd |d   |z   gz   }Ut        |      dk(  r|j                  ||z          x|j                  |        |S )a  
        Ensures each syllable has at least one vowel.
        If the following syllable doesn't have vowel, add it to the current one.

        :param syllable_list: Single word or token broken up into syllables.
        :type syllable_list: list(str)
        :return: Single word or token broken up into syllables
                 (with added syllables if necessary)
        :rtype: list(str)
         |r   Nr   )	recompilejoinr   r   r   r    searchr   )r   syllable_listvalid_syllablesfrontvowel_patternr   syllables          r   validate_syllablesz$SyllableTokenizer.validate_syllablesq   s     

388DKK#89$]3KAx;&&&x0 ''1'1,X%E&5cr&:'+h6> 'O '1,#**58+;<#**84 4" r   c                 &   | j                        }t        fd| j                  D              dk  rgS g }|d   d   }t        |d      D ]  }t	        | \  }}|\  }}	}
|d   }|	dk(  r%|j                  |       |j                  |       d}C||	cxk\  r|
k(  rn n||z  }|j                  |       d}j||	cxkD  r|
k  rn n|j                  |       d}||z  }||z  } ||d   d   z  }|j                  |       | j                  |      S )a"  
        Apply the SSP to return a list of syllables.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        c              3   @   K   | ]  }j                  |        y w)N)count).0xr'   s     r   	<genexpr>z-SyllableTokenizer.tokenize.<locals>.<genexpr>   s     3{!u{{1~{s      r      )nr   r+   )r)   sumr   r   zipr    r6   )r   r'   r(   r1   r5   trigramphonemesr&   
prev_valuefocal_value
next_valuefocal_phonemes    `          r   tokenizezSyllableTokenizer.tokenize   s@     --e4 3t{{33q87N#A&q).!4G"G}Hf28/JZ$QKM b $$X.$$]3{8j8M)$$X.k6J6$$X.M) M)1 54 	$R(++X&&&}55r   N)r	   F)__name__
__module____qualname____doc__r   r)   r6   rH    r   r   r   r   +   s    =8 8B06r   r   )
rL   r-   r"   stringr   nltk.tokenize.apir   	nltk.utilr   r   rM   r   r   <module>rQ      s)   4 
   ( W6
 W6r   