
    g|                      F    d dl Z d dlmZ d dlmZ d dlmZ  G d de      Zy)    N)warn)ElementTree)CorpusReaderc                   X     e Zd ZdZ fdZd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Z xZS )BCP47CorpusReaderu~  
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:
    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'

    c                    t         |   ||       i | _        | j                  d      5 }| j	                  |j                         j                  d            | _        ddd       | j                  d      5 }| j                  t        j                  |      j                  d            | _        ddd       | j                          y# 1 sw Y   lxY w# 1 sw Y   &xY w)zRead the BCP-47 databasez!iana/language-subtag-registry.txtz%%
Nzcldr/common-subdivisions-en.xmlz+localeDisplayNames/subdivisions/subdivision)super__init__langcodeopen	data_dictreadsplitdbsubdiv_dictetparseiterfindsubdiv
morphology)selfrootfileidsfp	__class__s       M/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/bcp47.pyr
   zBCP47CorpusReader.__init__    s    w'YY:;rnnRWWY__V%<=DG <YY89R**%%&STDK : 	 <;99s   4C	79C	CCc                     | j                  d      5 }| j                  |j                         j                         j	                  d      dd       | _        ddd       y# 1 sw Y   yxY w)z:Load conversion table to Wikidata Q-codes (only if needed)z-cldr/tools-cldr-rdf-external-entityToCode.tsv
   N)r   	wiki_dictr   stripr   wiki_q)r   r   s     r   load_wiki_qzBCP47CorpusReader.load_wiki_q,   sM    YYFG2..):)@)@)Fqr)JKDK HGGs   AA  A)c                     |D cg c]!  }|j                         j                  d      # c}D ci c]  }|d   |d   j                  d      d    c}S c c}w c c}w )z7Convert Wikidata list of Q-codes to a BCP-47 dictionary	r   r   /)r!   r   )r   lineslinepairs       r   r    zBCP47CorpusReader.wiki_dict1   sh     ?DDed++D1eD
D GT!W]]3'++D
 	
D
s
   &A"Ac                 \    |D ci c]  }|j                   d   |j                   c}S c c}w )z2Convert the CLDR subdivisions list to a dictionarytype)attribtext)r   subdivssubs      r   r   zBCP47CorpusReader.subdiv_dict8   s,    8?@

6"CHH,@@@s   !)c           
         t         j                  t         j                  t         j                  t         j                  t         j                  d| _        d}d}d}d}t        j                  |dz   d      t        j                  |dz         t        j                  | |dz         t        j                  d|d	z   d
|dz   d      t        j                  |dz   |dz   dz         t        j                  |       d| _        y )N)languageextlangscriptregionvariantz[0-9]z[a-z]z[A-Z]z[a-zA-Z0-9]   ?(   z)|()   )r2   r3   r4   r5   r6   	singleton)strlowertitleuppercasingrecompileformat)r   diglowupalnums        r   r   zBCP47CorpusReader.morphology<   s    		yyiiiiyy
 

c!eWA;/zzSUG-jjB4Aw0jj1RTF#c!eWA!67zzU1WIuSy!m_"=>se-
    c                    |d   j                  dd      j                         | _        i }i |d<   dD ]
  }i |d   |<    |dd D ]  }|j                         j                  d      D cg c]  }|j                  d	       }}|d   d   }|d   d   }||vri ||<   i }	|d
d D ]  }t	        |      d
k(  r%|\  }
}|
|	vr|g|	|
<   n9|	|
   j                  |       n$|	
   dxx   d|d   j                         z   z  cc<   d|vs_|dk(  se|
dk(  sk|| j                  |	|
   d   <    |	D ]  }
t	        |	|
         dk(  s|	|
   d   |	|
<   ! d|v r|	|d   |   |<   |	||   |<     |S c c}w )z;Convert the BCP-47 language subtag registry to a dictionaryr   z
File-Date: 
deprecated)r2   r3   r4   r5   r6   	redundantgrandfatheredr   Nr   : r:   r'    
Deprecatedr2   Description)replacer!   versionr   lenappendr   )r   recordsdiclabelrecordfieldfieldstyptag	subfieldskeyvals               r   r   zBCP47CorpusReader.data_dictQ   s   qz)),;AACL
E (*Ce$
 abkF5;\\^5I5I$5OP5OEekk$'5OFP)A,C)A,C#~CIu:?!&JS#)+*-	#!#--c2cN2&#a0@*@@& .z)},8;DMM)C."45 $ !y~&!+%.s^A%6IcN ! v%.7L!#&s+ )C9 ": 
9 Qs   )E'c                 4    t        |      t        k(  r|d   }|S )zReturn only first valuer   )r,   list)r   rb   s     r   val2strzBCP47CorpusReader.val2str   s    9a&C
rJ   c                 @    |d    }dD ]  }||v s|d||    z  } |S )zConcatenate subtag valuesr2   )r3   r4   r5   r6   	extensionrP    )r   	lg_recordnamerZ   s       r   lang2strzBCP47CorpusReader.lang2str   s@    J'(LE	!"Yu-.// M rJ   c                 >   |j                  d      }i }g d}|r|r|j                  d      }d}|r;|j                  d      } | j                  |   |      }| j                  |   j	                  |      r|| j
                  |   v rFd}| j                  | j
                  |   |   d         }|dk(  r||v r||xx   d|z   z  cc<   n|||<   n|| j
                  d	   |   v rd}d
|d| d}	d| j
                  d	   |   |   v r0| j
                  d	   |   |   d   }
|	d| j                  |
       dz  }	| j                  | j
                  d	   |   |   d         ||<   t        |	       n|r;|s|dk(  r2|d   dk(  r*|d   }|| j                  v r| j                  |   }njd d}nc| dj                  |D cg c]  }d|z   	 c}       j                         }| j                  d   j	                  |      sd| d}t        |       ||d<   g }|r|r|S c c}w )z8Convert a BCP-47 tag to a dictionary of labelled subtags-)r2   r3   r4   r5   r6   r6   r   FTrS   r6   rP   rM   The rQ   z code is deprecatedPreferred-Valuez', prefer ''usdr   z<Unknown subdivision: >rL   r=   z<Invalid extension: rg   )r   poprB   rE   	fullmatchr   re   r   r   joinr?   )r   r_   subtagslanglabelssubtagfoundrZ   valstrnotepreferrr   exts                r   	parse_tagzBCP47CorpusReader.parse_tag   sa   ))C.R&[[^FE

1+U+F3;;u%//7/ $!%dggenV.D].S!T I-%4- K4&=8K*0DK477<#8#?? $!%fZq7JK,0Ee0LV0TT%)WW\%:5%A&%I 1&F !k$,,v2F1Gq$IID&*ll GGL1%8@O'U T
1 2 S=WQZ4%7 BT[[("kk"o 6se1=#HRWW-I#c#g-I%J$KLRRTC;;{3==fE 4SE;S	$'[!S &T  .Js   9H
c                    dD ]  }d}|| j                   |   v r| j                   |   |   d    }d|d| }n||| j                   d   |   v rh| j                   d   |   |   d    }d|d| d}d| j                   d   |   |   v r/| j                   d   |   |   d   }|d	| j                  |      z  }|st               |c S  	 | j                  | j	                  |            S #  t        d
|d       Y yxY w)z
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'

        )rN   rO   NrS   rn   z	 code is rM   z and deprecatedro   z	, prefer zTag z was not recognized)r   re   r   rk   r   )r   r_   rZ   rb   r}   r~   s         r   rj   zBCP47CorpusReader.name   s5    4ECdggen$,];<cWIeW5-e44.u5c:=IJcWIeWOD$(=e(DS(II!WW\259#>?PQFiV(<'?@@DT

 4	==!455	4w123s   C( (C;)__name__
__module____qualname____doc__r
   r#   r    r   r   r   re   rk   r   rj   __classcell__)r   s   @r   r   r      s>     
L

A
*,\/brJ   r   )	rC   warningsr   	xml.etreer   r   nltk.corpus.readerr   r   rh   rJ   r   <module>r      s!    
  ' +K KrJ   