
    g                     V    d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	  G d de      Z
y)	av  
An NLTK interface for the n-gram statistics gathered from
the corpora for each language using An Crubadan.

There are multiple potential applications for the data but
this reader was created with the goal of using it in the
context of language identification.

For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
    N)path)CorpusReader)ZipFilePathPointer)FreqDistc                   P     e Zd ZdZdZi Zd
 fd	Zd Zd Zd Z	d Z
d Zd	 Z xZS )CrubadanCorpusReaderzK
    A corpus reader used to access language An Crubadan n-gram files.
    z	table.txtc                 X    t         |   ||d       g | _        | j                          y )Nutf8encoding)super__init___lang_mapping_data_load_lang_mapping_data)selfrootfileidsr   tagset	__class__s        P/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/crubadan.pyr   zCrubadanCorpusReader.__init__%   s+    w8"$$$&    c                 x    || j                   vr| j                  |      | j                   |<   | j                   |   S )zTReturn n-gram FreqDist for a specific language
        given ISO 639-3 language code)_all_lang_freq_load_lang_ngrams)r   langs     r   	lang_freqzCrubadanCorpusReader.lang_freq*   s>     t***(,(>(>t(DD%""4((r   c                 F    | j                   D cg c]  }|d   	 c}S c c}w )z7Return a list of supported languages as ISO 639-3 codes   )r   )r   rows     r   langszCrubadanCorpusReader.langs3   s&    "&"9"9:"93A"9:::s   c                 |    | j                   D ]-  }|d   j                         |j                         k(  s(|d   c S  y)z5Return internal Crubadan code based on ISO 639-3 coder   r   Nr   lowerr   r   is      r   iso_to_crubadanz$CrubadanCorpusReader.iso_to_crubadan7   3    ((Atzz|tzz|+t )r   c                 |    | j                   D ]-  }|d   j                         |j                         k(  s(|d   c S  y)z2Return ISO 639-3 code given internal Crubadan coder   r   Nr"   r$   s      r   crubadan_to_isoz$CrubadanCorpusReader.crubadan_to_iso=   r'   r   c                    t        | j                  t              rt        d      t	        j
                  | j                  | j                        }| j                  | j                         vrt        d|z         t        |d      5 }|j                         j                         }|j                  d      D cg c]  }|j                  d       c}| _        ddd       yc c}w # 1 sw Y   yxY w)zCLoad language mappings between codes and description from table.txtz?Please install the 'crubadan' corpus first, use nltk.download()z%Could not find language mapper file: utf-8r   
	N)
isinstancer   r   RuntimeErrorr   join_LANG_MAPPER_FILEr   openreadstripsplitr   )r   mapper_fileraw	strip_rawr   s        r   r   z,CrubadanCorpusReader._load_lang_mapping_dataC   s    dii!34Q  ii		4+A+AB!!7FTUU+0C
((*IBK//RVBW&XBW3syyBW&XD# 10 'Y 10s   2C'9C"C'"C''C0c                    || j                         vrt        d      | j                  |      }t        j                  | j
                  |dz         }t        j                  |      st        d      t               }t        |d      5 }|D ]:  }|j                  d      }|d   j                  d      }t        |d	         }	|	||<   < 	 d
d
d
       |S # 1 sw Y   |S xY w)zbLoad single n-gram language file given the ISO 639-3 language code
        and return its FreqDistzUnsupported language.z-3grams.txtz,No N-gram file found for requested language.r+   r    r   r,   r   N)r    r/   r&   r   r0   r   isfiler   r2   r5   r4   int)
r   r   crubadan_code
ngram_filecountsflinedatangramfreqs
             r   r   z&CrubadanCorpusReader._load_lang_ngramsS   s     tzz|#677,,T2YYtyy--*GH
{{:&MNN*w/1zz#Qd+47| $u  0  0 s   	A CC)r
   N)__name__
__module____qualname____doc__r1   r   r   r   r    r&   r)   r   r   __classcell__)r   s   @r   r   r      s9     $N'
);Y r   r   )rH   reosr   nltk.corpus.readerr   	nltk.datar   nltk.probabilityr   r    r   r   <module>rP      s)   
 
  + ( %M< Mr   