
    Of1                     Z    S SK r S SKJr  S SKJrJr  S r " S S\5      r " S S\5      rg)	    N)CorpusReader)StreamBackedCorpusViewconcatc                 J   ^  [         R                  " T 5      SU 4S jj5       nU$ )Nc                 h   > UR                  SS 5        U(       d  U R                  5       nT" X40 UD6$ )Ntags)popfileids)selfr
   kwargsfuns      ;/usr/lib/python3/dist-packages/nltk/corpus/reader/ipipan.py	decorator_parse_args.<locals>.decorator   s/    

64 llnG4+F++    N)	functoolswraps)r   r   s   ` r   _parse_argsr      s&    __S, , r   c                       \ rS rSrSrS rSS jrSS jrSS jrSS jr	\
SS	 j5       r\
SS
 j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       rS rS rS rSS jrS rS rS rSrg)IPIPANCorpusReader   a  
Corpus reader designed to work with corpus created by IPI PAN.
See http://korpus.pl/en/ for more details about IPI PAN corpus.

The corpus includes information about text domain, channel and categories.
You can access possible values using ``domains()``, ``channels()`` and
``categories()``. You can use also this metadata to filter files, e.g.:
``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

The reader supports methods: words, sents, paras and their tagged versions.
You can get part of speech instead of full tag by giving "simplify_tags=True"
parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

Also you can get all tags disambiguated tags specifying parameter
"one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

You can get all tags that were assigned by a morphological analyzer specifying
parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

The IPIPAN Corpus contains tags indicating if there is a space between two
tokens. To add special "no space" markers, you should specify parameter
"append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
As a result in place where there should be no space between two tokens new
pair ('', 'no-space') will be inserted (for tagged data) and just '' for
methods without tags.

The corpus reader can also try to append spaces between words. To enable this
option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
As a result either ' ' or (' ', 'space') will be inserted between tokens.

By default, xml entities like &quot; and &amp; are replaced by corresponding
characters. You can turn off this feature, specifying parameter
"replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
c                 6    [         R                  " XUS S 5        g r   )r   __init__)r   rootr
   s      r   r   IPIPANCorpusReader.__init__=   s    d'4>r   Nc                 T    U(       d  U R                  5       nU R                  US5      $ )Nchannelr
   _parse_headerr   r
   s     r   channelsIPIPANCorpusReader.channels@   s#    llnG!!'955r   c                 T    U(       d  U R                  5       nU R                  US5      $ )Ndomainr   r!   s     r   domainsIPIPANCorpusReader.domainsE   s#    llnG!!'844r   c                     U(       d  U R                  5       nU R                  US5       Vs/ s H  o R                  U5      PM     sn$ s  snf )NkeyTerm)r
   r    _map_category)r   r
   cats      r   
categoriesIPIPANCorpusReader.categoriesJ   sH    llnG/3/A/A'9/U
/Us#/U
 	
 
s   A	c                    Ub  Ub  Ub  [        S5      eUc  Uc  Uc  [        R                  " U 5      $ [        U[        5      (       a  U/n[        U[        5      (       a  U/n[        U[        5      (       a  U/nU(       a  U R                  SU5      $ U(       a  U R                  SU5      $ U R                  SX0R                  S9$ )NzNYou can specify only one of channels, domains and categories parameter at oncer   r%   r)   )map)
ValueErrorr   r
   
isinstancestr_list_morph_files_byr*   )r   r"   r&   r,   s       r   r
   IPIPANCorpusReader.fileidsQ   s    G$7J<R3  J4F''--h$$ zHgs##iGj#&&$J,,YAA,,Xw??,,:+=+= -  r   c                     [        U R                  U5       Vs/ s H'  nU R                  " U4[        R                  SS.UD6PM)     sn5      $ s  snf NF)moder   r   _list_morph_files_viewIPIPANCorpusView
SENTS_MODEr   r
   r   fileids       r   sentsIPIPANCorpusReader.sentsh   d    
 #44W=	 >F 

!1!<!<5LR >	
 	
   .Ac                     [        U R                  U5       Vs/ s H'  nU R                  " U4[        R                  SS.UD6PM)     sn5      $ s  snf r6   r   r9   r:   r;   
PARAS_MODEr=   s       r   parasIPIPANCorpusReader.parass   rA   rB   c           
          [        U R                  U5       Vs/ s H  nU R                  " U4SS0UD6PM     sn5      $ s  snf )Nr   Fr   r9   r:   r=   s       r   wordsIPIPANCorpusReader.words~   sO     #44W==F 

6888=
 	
s   ?c           
          [        U R                  U5       Vs/ s H&  nU R                  " U4S[        R                  0UD6PM(     sn5      $ s  snf Nr7   r8   r=   s       r   tagged_sentsIPIPANCorpusReader.tagged_sents   V     #44W==F 

6N(8(C(CNvN=
 	
   -Ac           
          [        U R                  U5       Vs/ s H&  nU R                  " U4S[        R                  0UD6PM(     sn5      $ s  snf rM   rD   r=   s       r   tagged_parasIPIPANCorpusReader.tagged_paras   rP   rQ   c           
          [        U R                  U5       Vs/ s H  o0R                  " U40 UD6PM     sn5      $ s  snf r   rI   r=   s       r   tagged_wordsIPIPANCorpusReader.tagged_words   s=    8<8N8Nw8WX8WfZZ)&)8WX
 	
Xs   <c                 N    U R                  U5       Vs/ s H  o"PM     sn$ s  snf r   )abspathsr   r
   fs      r   r9   $IPIPANCorpusReader._list_morph_files   s%    ==121a1222s   "c                 p    U R                  U5       Vs/ s H  nUR                  SS5      PM     sn$ s  snf Nz	morph.xmlz
header.xml)r9   replacerZ   s      r   _list_header_files%IPIPANCorpusReader._list_header_files   s?     ++G4
4 IIk<04
 	
 
s   3c                     [        5       nU R                  U5       H.  nU R                  XB5      nU H  nUR                  U5        M     M0     [	        U5      $ r   )setr`   _get_tagaddlist)r   r
   tagvaluesr[   values_listvs          r   r     IPIPANCorpusReader._parse_header   sM    ((1A--/K 

1 ! 2 F|r   c                    U R                  5       n[        5       nU Ha  nU R                  U5      R                  SS5      nU R	                  Xq5      nU H&  n	Ub  U" U	5      n	X;   d  M  UR                  U5        M(     Mc     [        U5      $ r^   )r
   rc   abspathr_   rd   re   rf   )
r   rg   rh   r/   r
   ret_fileidsr[   fpri   values
             r   r3   'IPIPANCorpusReader._list_morph_files_by   s|    ,,.eAa((lCB--0K$?JE?OOA&	 %  K  r   c                 (   / n[        U5       nUR                  5       nS S S 5        Sn WR                  SU-   U5      nUS:  a  U$ UR                  SU-   S-   U5      nUR                  XW[	        U5      -   S-   U 5        MY  ! , (       d  f       Nj= f)Nr   <z</>   )openreadfindappendlen)r   r[   rg   r   infileheadertag_endtag_poss           r   rd   IPIPANCorpusReader._get_tag   s    !W[[]F kk#)W5G{kk$*s"2G<GKKS1A5@A  Ws   B
Bc                 B    UR                  S5      nUS:X  a  U$ XS-   S  $ )Nrt      )rx   )r   r+   poss      r   r*    IPIPANCorpusReader._map_category   s)    hhsm"9JQwy>!r   c                    UR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  S	S5      nUR                  S
S5      n	UR                  SS5      n
[        U5      S:  a  [        SUR                  5       -  5      eU(       d  U(       d  [        S5      eU(       d   U(       d  U(       a  U(       d  [        S5      e[	        UUUUUUUU	U
S9	$ )Nr   Tr7   r   simplify_tagsFone_tagdisamb_onlyappend_no_spaceappend_spacereplace_xmlentitieszUnexpected arguments: %sz;You cannot specify both one_tag=False and disamb_only=Falsez[You cannot specify simplify_tags, one_tag or disamb_only with functions other than tagged_*)r   r7   r   r   r   r   r   r   )r	   rz   r0   keysr;   )r   filenamer   r   r7   r   r   r   r   r   r   s              r   r:   IPIPANCorpusReader._view   s   zz&$'zz&!$

?E:**Y-jj5 **%6>zz.%8$jj)>Ev;?7&++-GHH{P  g[A 
  '#+% 3

 
	
r    r   )NNN)__name__
__module____qualname____firstlineno____doc__r   r"   r&   r,   r
   r   r?   rF   rJ   rN   rS   rV   r9   r`   r    r3   rd   r*   r:   __static_attributes__r   r   r   r   r      s    !F?6
5

. 
 
 
 
 
 
 
 
 
 
 
 

3
!
B" 
r   r   c                   B    \ rS rSrSrSrSrSS jrS rS r	S r
S	 rS
rg)r;      r   r   ru   c                    [         R                  " XS US 5        SU l        SU l        UR	                  SS5      U l        UR	                  SS5      U l        UR	                  S[        R                  5      U l	        UR	                  SS5      U l
        UR	                  SS5      U l        UR	                  S	S5      U l        UR	                  S
S5      U l        UR	                  SS5      U l        g )NFr   r   Tr   r7   r   r   r   r   r   )r   r   in_sentencepositionr	   	show_tagsr   r;   
WORDS_MODEr7   r   r   r   r   r   )r   r   startposr   s       r   r   IPIPANCorpusView.__init__   s    ''hM FD1!::mT:JJv'7'B'BC	#ZZ?zz)T2%zz*;UC"JJ~u=#)::.CT#J r   c                 J   / n/ nSnSn[        5       nU R                  U5      n [        U5      S::  a"  U R                  U5        U R                  U5      nUS/:X  a  U(       a   e/ $ UR	                  5       nU =R
                  [        U5      S-   -  sl        UR                  S5      (       a	  SU l        GOeUR                  S5      (       a  GOMUR                  S5      (       aB  U R                  (       a  U(       a  U(       d  U R                  U5        SnSnSn	[        5       nGOUR                  S5      (       a  U R                  (       a  SU l        U R                  U5        U R                  U R                  :X  a  U/$ U R                  U R                  :X  a$  U R                  (       a  U R                  U5        U$ UR                  U5        GOHU R                  U R                  :X  a  U R                  U5        U/$ GOUR                  S	5      (       a:  US
S n	U R                  (       a"  U	R!                  SS5      R!                  SS5      n	GOUR                  S5      (       a^  U R"                  (       a  UR%                  S5      S:w  a6  XR'                  S5      S
-   UR'                  S5       n
UR)                  U
5        GOTUR                  S5      (       a  U R*                  (       a  U R,                  (       a#  U Vs/ s H  oR/                  S5      S   PM     nnU R0                  (       a  U R"                  (       d  UR                  W	[3        U5      45        OUR                  W	UR	                  5       45        OUR                  W	5        OUR                  S5      (       aY  U R                  (       a  SnU R4                  (       a4  U R*                  (       a  UR                  S5        O)UR                  S5        OUR                  S5      (       a   GM  s  snf )NFTr    z<chunk type="s"z<chunk type="p"z<tokz</chunkz<orth   iz&quot;"z&amp;&z<lexzdisamb=r   z<ctagz</ctagz</tok:r   z<ns/>)r   zno-spacez</cesAna)rc   
_read_datarz   _seekr	   r   
startswithr   r   _append_spacer7   r<   r   ry   rE   r   r_   r   rx   indexre   r   r   splitr   tupler   )r   streamsentence	sentencesspaceno_spacer   lineslineorthrg   ts               r   
read_blockIPIPANCorpusView.read_block  s   	u'5zQ

6"/}$$}	99;DMMSY]*M011#' !233(($$x&&x0 u++##',D$JJv&yyDOO3 (z)doo5,, ..x8'!((2YY$//1JJv&%;& 2 ))Abz++<<#6>>wLD((''499Y+?2+Ezz'2Q6H9MNCHHSM))>>))9=>AQ><<t/?/? uT{(;< txxz(:;OOD)))$$#H''~~ (89 +,,E d  ?s   P c                     UR                  5       U l        UR                  S5      nUR                  S5      nUR	                  5         U$ )Ni   
)tellr   rw   r   reverse)r   r   buffr   s       r   r   IPIPANCorpusView._read_dataT  s9    {{4 

4 r   c                 :    UR                  U R                  5        g r   )seekr   )r   r   s     r   r   IPIPANCorpusView._seek[  s    DMM"r   c                 l    U R                   (       a  UR                  S5        g UR                  S5        g )N) r   r   )r   ry   )r   r   s     r   r   IPIPANCorpusView._append_space^  s!    >>OON+OOC r   )
r   r   r   r   r7   r   r   r   r   r   N)r   )r   r   r   r   r   r<   rE   r   r   r   r   r   r   r   r   r   r;   r;      s-    JJJKL\#!r   r;   )	r   nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   r   r;   r   r   r   <module>r      s3     / BW
 W
to!- o!r   