
    Of|                      F    S SK r S SKJr  S SKJr  S SKJr   " S S\5      rg)    N)warn)ElementTree)CorpusReaderc                   b   ^  \ rS rSrSrU 4S jrS rS rS rS r	S r
S	 rS
 rS rS rSrU =r$ )BCP47CorpusReader   uV  
Parse BCP-47 composite language tags

Supports all the main subtags, and the 'u-sd' extension:

>>> from nltk.corpus import bcp47
>>> bcp47.name('oc-gascon-u-sd-fr64')
'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

Can load a conversion table to Wikidata Q-codes:
>>> bcp47.load_wiki_q()
>>> bcp47.wiki_q['en-GI-spanglis']
'Q79388'

c                   > [         TU ]  X5        0 U l        U R                  S5       nU R	                  UR                  5       R                  S5      5      U l        SSS5        U R                  S5       nU R                  [        R                  " U5      R                  S5      5      U l        SSS5        U R                  5         g! , (       d  f       Nr= f! , (       d  f       N0= f)zRead the BCP-47 databasez!iana/language-subtag-registry.txtz%%
Nzcldr/common-subdivisions-en.xmlz+localeDisplayNames/subdivisions/subdivision)super__init__langcodeopen	data_dictreadsplitdbsubdiv_dictetparseiterfindsubdiv
morphology)selfrootfileidsfp	__class__s       :/usr/lib/python3/dist-packages/nltk/corpus/reader/bcp47.pyr   BCP47CorpusReader.__init__    s    'YY:;rnnRWWY__V%<=DG <YY89R**%%&STDK : 	 <;99s   4C	6:C	
C
C(c                     U R                  S5       nU R                  UR                  5       R                  5       R	                  S5      SS 5      U l        SSS5        g! , (       d  f       g= f)z:Load conversion table to Wikidata Q-codes (only if needed)z-cldr/tools-cldr-rdf-external-entityToCode.tsv
   N)r   	wiki_dictr   stripr   wiki_q)r   r   s     r   load_wiki_qBCP47CorpusReader.load_wiki_q,   sM    YYFG2..):)@)@)Fqr)JKDK HGGs   AA  
A.c                     U Vs/ s H!  o"R                  5       R                  S5      PM#     sn Vs0 s H  nUS   US   R                  S5      S   _M      sn$ s  snf s  snf )z7Convert Wikidata list of Q-codes to a BCP-47 dictionary	r!   r   /)r#   r   )r   lineslinepairs       r   r"   BCP47CorpusReader.wiki_dict1   sf     ?DDed++D1eD
D GT!W]]3'++D
 	
D
s
   (A%Ac                 `    U Vs0 s H  o"R                   S   UR                  _M     sn$ s  snf )z2Convert the CLDR subdivisions list to a dictionarytype)attribtext)r   subdivssubs      r   r   BCP47CorpusReader.subdiv_dict8   s*    8?@

6"CHH,@@@s   #+c           
         [         R                  [         R                  [         R                  [         R                  [         R                  S.U l        SnSnSnSn[
        R                  " US-   S35      [
        R                  " US-   5      [
        R                  " U US-   35      [
        R                  " SUS	-   S
US-   S35      [
        R                  " US-   US-   S-   35      [
        R                  " U 5      S.U l        g )N)languageextlangscriptregionvariantz[0-9]z[a-z]z[A-Z]z[a-zA-Z0-9]   ?(   z)|()   )r7   r8   r9   r:   r;   	singleton)strlowertitleuppercasingrecompileformat)r   diglowupalnums        r   r   BCP47CorpusReader.morphology<   s    		yyiiiiyy
 

c!eWA;/zzSUG-jjB4Aw0jj1RTF#c!eWA!67zzU1WIuSy!m_"=>se-
    c                    US   R                  SS5      R                  5       U l        0 n0 US'   S H  n0 US   U'   M     USS  GH(  nUR                  5       R                  S5       Vs/ s H  oUR                  S	5      PM     nnUS   S   nUS   S   nXr;  a  0 X''   0 n	US
S  H  n[	        U5      S
:X  a#  Uu  pX;  a  U/X'   O8X   R                  U5        O$U	W
   S==   SUS   R                  5       -   -  ss'   SU;  d  Ma  US:X  d  Mi  U
S:X  d  Mq  XR                  X   S   '   M     U	 H  n
[	        X   5      S:X  d  M  X   S   X'   M!     SU;   a  XS   U   U'   GM!  XU   U'   GM+     U$ s  snf )z;Convert the BCP-47 language subtag registry to a dictionaryr   z
File-Date: 
deprecated)r7   r8   r9   r:   r;   	redundantgrandfatheredr!   Nr    : r?   r*    
Deprecatedr7   Description)replacer#   versionr   lenappendr   )r   recordsdiclabelrecordfieldfieldstyptag	subfieldskeyvals               r   r   BCP47CorpusReader.data_dictQ   s   qz)),;AACL
E (*Ce$
 abkF5;\\^5I5I$5OP5OEkk$'5OFP)A,C)A,C~Iu:?!&JS+*-	!--c2cN2&#a0@*@@& .z)},8;MM)."45 $ !y~&!+%.^A%6IN ! v%.7L!#&s+ )C9 ": 
9 Qs   +E5c                 6    [        U5      [        :X  a  US   nU$ )zReturn only first valuer   )r0   list)r   rh   s     r   val2strBCP47CorpusReader.val2str   s    9a&C
rP   c                 F    US    nS H  nX1;   d  M
  USX    3-  nM     U$ )zConcatenate subtag valuesr7   )r8   r9   r:   r;   	extensionrV    )r   	lg_recordnamer`   s       r   lang2strBCP47CorpusReader.lang2str   s<    J'(LE!"Y-.// M rP   c                    UR                  S5      n0 n/ SQnU(       Ga0  U(       Ga(  UR                  S5      nSnU(       GaJ  UR                  S5      nU R                  U   " U5      nU R                  U   R	                  U5      (       a  XPR
                  U   ;   aF  SnU R                  U R
                  U   U   S   5      nUS:X  a  Xs;   a  X7==   SU-   -  ss'   OXU'   OXPR
                  S	   U   ;   a  SnS
U< SU S3n	SU R
                  S	   U   U   ;   a0  U R
                  S	   U   U   S   n
U	SU R                  U
5       S3-  n	U R                  U R
                  S	   U   U   S   5      X7'   [        U	5        O
U(       a  GMJ  U(       d  US:X  a4  US   S:X  a+  US   nXR                  ;   a  U R                  U   nOrSW S3nOkU SR                  U Vs/ s H  nSU-   PM
     sn5       3R                  5       nU R                  S   R	                  U5      (       d  SU S3n[        U5        XS'   / nU(       a
  U(       a  GM(  U$ s  snf )z8Convert a BCP-47 tag to a dictionary of labelled subtags-)r7   r8   r9   r:   r;   r;   r   FTrY   r;   rV   rS   The rW   z code is deprecatedPreferred-Valuez', prefer ''usdr!   z<Unknown subdivision: >rR   rB   z<Invalid extension: ro   )r   poprG   rJ   	fullmatchr   rl   r   r   joinrD   )r   re   subtagslanglabelssubtagfoundr`   valstrnotepreferr{   exts                r   	parse_tagBCP47CorpusReader.parse_tag   sY   ))C.R&[[^FE

1U+F3;;u%//77/ $!%dggenV.D].S!T I-%- K4&=8K*0K77<#8#?? $!%fZq7JK,0Ee0LV0TT%)WW\%:5%A&%I 1&F !k$,,v2F1Gq$IID&*ll GGL1%8@O' T
1 &2 S=WQZ4%7 B[[("kk"o 6se1=#HRWW-I#c#g-I%J$KLRRTC;;{3==fEE 4SE;S	$'[!S &&T  .Js   I
c                    S H  nSnXR                   U   ;   a   U R                   U   U   S    nSU< SU 3nOXR                   S   U   ;   al  U R                   S   U   U   S    nSU< SU S3nSU R                   S   U   U   ;   a0  U R                   S   U   U   S   nUS	U R                  U5      < 3-  nU(       d  M  [        W5        Us  $     U R                  U R	                  U5      5      $ !   [        S
U< S35         g= f)z
Convert a BCP-47 tag to a colon-separated string of subtag names

>>> from nltk.corpus import bcp47
>>> bcp47.name('ca-Latn-ES-valencia')
'Catalan: Latin: Spain: Valencian'

)rT   rU   NrY   rw   z	 code is rS   z and deprecatedrx   z	, prefer zTag z was not recognized)r   rl   r   rs   r   )r   re   r`   rh   r   r   s         r   rr   BCP47CorpusReader.name   s3    4ECggen$,];<cWIeW5-e44.u5c:=IJcWIeWOD$(=e(DS(II!WW\259#>?PQFiV(<'?@@DsT

 4	==!455	4w123s   C6 6D
)rG   r   rJ   r   r   r[   r$   )__name__
__module____qualname____firstlineno____doc__r   r%   r"   r   r   r   rl   rs   r   rr   __static_attributes____classcell__)r   s   @r   r   r      sC     
L

A
*,\/b rP   r   )	rH   warningsr   	xml.etreer   r   nltk.corpus.readerr   r   rp   rP   r   <module>r      s!    
  ' +K KrP   