
    g                     x    d Z ddlmZ ddlmZ 	 ddlZ G d d      Zd Z	e
dk(  r e	        yy# e$ r dZY $w xY w)	a  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
    )maxsize)trigramsNc                   D    e Zd ZdZi ZdZdZi Zd Zd Z	d Z
d Zd Zd	 Zy)
TextCatN<>c                     t         st        d      ddlm} || _        | j                  j                         D ]  }| j                  j                  |        y )Nzclassify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.r   )crubadan)reOSErrornltk.corpusr
   _corpuslangs	lang_freq)selfr
   langs      J/var/www/openai/venv/lib/python3.12/site-packages/nltk/classify/textcat.py__init__zTextCat.__init__7   sL    #  	)LL&&(DLL""4( )    c                 0    t        j                  dd|      S )z)Get rid of punctuation except apostrophesz[^\P{P}\']+ )r   subr   texts     r   remove_punctuationzTextCat.remove_punctuationG   s    vvnb$//r   c                 0   ddl m}m} | j                  |      } ||      } |       }|D ]c  }t	        | j
                  |z   | j                  z         }|D 	cg c]  }	dj                  |	       }
}	|
D ]  }||v r||xx   dz  cc<   d||<    e |S c c}	w )z'Create FreqDist of trigrams within textr   )FreqDistword_tokenizer      )nltkr   r   r   r   _START_CHAR	_END_CHARjoin)r   r   r   r   
clean_texttokensfingerprintttoken_trigram_tuplestritoken_trigramscur_trigrams               r   profilezTextCat.profileK   s    0,,T2
z*jA#+D,<,<q,@4>>,Q#R 6JK6Jsbggcl6JNK-+-,1,/0K,	  .	   Ls   Bc                    | j                   j                  |      }d}||v r`t        |j                               j	                  |      }t        |j                               j	                  |      }t        ||z
        }|S t        }|S )zgCalculate the "out-of-place" measure between the
        text and language profile for a single trigramr   )r   r   listkeysindexabsr   )r   r   trigramtext_profilelang_fddistidx_lang_profileidx_texts           r   	calc_distzTextCat.calc_dist_   s     ,,((.g#GLLN399'BL--/066w?H '(23D  Dr   c                     i }| j                  |      }| j                  j                  j                         D ]&  }d}|D ]  }|| j	                  |||      z  } |||<   ( |S )zOCalculate the "out-of-place" measure between
        the text and all languagesr   )r,   r   _all_lang_freqr/   r8   )r   r   	distancesr,   r   	lang_distr2   s          r   
lang_distszTextCat.lang_distst   so     	,,t$LL//446D I"T^^D'7CC	 # (IdO 7 r   c                     | j                  |      | _        t        | j                  | j                  j                        S )zYFind the language with the min distance
        to the text and return its ISO 639-3 code)key)r=   last_distancesmingetr   s     r   guess_languagezTextCat.guess_language   s4     #ood34&&D,?,?,C,CDDr   )__name__
__module____qualname__r   fingerprintsr!   r"   r@   r   r   r,   r8   r=   rC    r   r   r   r   /   s:    GLKIN) 0(*$Er   r   c            
         ddl m}  g d}dddddd	d
ddd	}t               }|D ]  }| j                  |      }t	        |      dz
  }t        t        t        |            }d}t        d|      D ]<  }	ddj                  t        d||	         D 
cg c]
  }
||	   |
    c}
      z   }||z  }> t        d|dd z   dz          |j                  |      }t        d| d||    d       t        d        y c c}
w )Nr   )udhr)	zKurdish-UTF8zAbkhaz-UTF8zFarsi_Persian-UTF8z
Hindi-UTF8zHawaiian-UTF8zRussian-UTF8zVietnamese-UTF8zSerbian_Srpski-UTF8zEsperanto-UTF8zNorthern Kurdish	AbkhazianzIranian PersianHindiHawaiianRussian
VietnameseSerbian	Esperanto)	kmrabkpeshinhawrusviesrpepor   r    zLanguage snippet:    z...zLanguage detection: z ()z############################################################################################################################################)r   rJ   r   sentslenr.   mapranger#   printrC   )rJ   r   friendlytccur_langraw_sentencesrowscolssampleijcur_sentguesss                r   demorn      s"    
E " 
H 
B

8,=!A%C]+, q$ASXXE!TRSWDU&VDUq}Q'7':DU&VWWHhF  
 	"VAc]2U:;!!&)$UG2huo->a@Ai#  'Ws   C3__main__)__doc__sysr   	nltk.utilr   regexr   ImportErrorr   rn   rD   rH   r   r   <module>ru      sZ   *  \E \E@.b zF q  	Bs   / 99