
    Of=                         S SK r S SKrS SKrS SKrS SKJr  S SKJrJr  S r	 " S S\5      r
 " S S\5      r " S	 S
5      r " S S\5      r " S S\5      r " S S\5      rg)    N)concat)XMLCorpusReaderXMLCorpusViewc                 J   ^  [         R                  " T 5      SU 4S jj5       nU$ )z^
Wraps function arguments:
if fileids not specified then function set NKJPCorpusReader paths.
c                 <   > U(       d  U R                   nT" X40 UD6$ N_paths)selffileidskwargsfuns      9/usr/lib/python3/dist-packages/nltk/corpus/reader/nkjp.py	decorator_parse_args.<locals>.decorator   s    kkG4+F++    r   )	functoolswraps)r   r   s   ` r   _parse_argsr      s(     __S, ,
 r   c                       \ rS rSrSrSrSrSrSS jrS r	S r
SS
 jrS r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       rSrg	)NKJPCorpusReader    r            c           	          [        U[        5      (       a  [        R                  " XUS-   5        O*[        R                  " XU Vs/ s H  o3S-   PM	     sn5        U R	                  5       U l        gs  snf )a  
Corpus reader designed to work with National Corpus of Polish.
See http://nkjp.pl/ for more details about NKJP.
use example:
import nltk
import nkjp
from nkjp import NKJPCorpusReader
x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
x.header()
x.raw()
x.words()
x.tagged_words(tags=['subst', 'comp'])  #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
x.sents()
x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
z.*/header.xmlz/header.xmlN)
isinstancestrr   __init__	get_pathsr
   )r   rootr   fileids       r   r   NKJPCorpusReader.__init__&   sa    $ gs##$$T?1JK$$'J'm3'J nn& Ks   A1c           	          U R                    Vs/ s HH  n[        R                  R                  [	        U R
                  5      UR                  S5      S   5      PMJ     sn$ s  snf )N
header.xmlr   )_fileidsospathjoinr   _rootsplitr   fs     r   r    NKJPCorpusReader.get_paths@   sR     ]]
" GGLLTZZ!'',*?*BC"
 	
 
s   AA!c                 h    U R                    Vs/ s H  oR                  S5      S   PM     sn$ s  snf )zN
Returns a list of file identifiers for the fileids that make up
this corpus.
r%   r   )r&   r+   r,   s     r   r   NKJPCorpusReader.fileidsF   s,    
 37--@-Q%a(-@@@s   /Nc                 V   UR                  S[        R                  5      nU[        R                  L a	  [        XS9$ U[        R                  L a	  [        XS9$ U[        R                  L a	  [        XS9$ U[        R                  L a  [        X[        R                  S9$ [        S5      e)zA
Returns a view specialised for use with particular corpus file.
mode)tags)r3   r2   zNo such mode!)popr   
WORDS_MODENKJPCorpus_Morph_View
SENTS_MODENKJPCorpus_Segmentation_ViewHEADER_MODENKJPCorpus_Header_ViewRAW_MODENKJPCorpus_Text_View	NameError)r   filenamer3   r   r2   s        r   _viewNKJPCorpusReader._viewM   s     zz&"2"="=>#...(==%000/DD%111)(>>%...'*>*G*G 
 O,,r   c                 D    U R                   U;   a  U$ U R                   U-   $ )z,
Add root if necessary to specified fileid.
)r!   )r   r"   s     r   add_rootNKJPCorpusReader.add_root`   s$     99Myy6!!r   c           
          [        U Vs/ s HC  nU R                  " U R                  U5      4S[        R                  0UD6R                  5       PME     sn5      $ s  snf )z)
Returns header(s) of specified fileids.
r2   )r   r?   rB   r   r9   handle_queryr   r   r   r"   s       r   headerNKJPCorpusReader.headerh   sn    
 
 &	 &F 

MM&)0@0L0LPV,.! &	
 	
   A
Ac           
          [        U Vs/ s HC  nU R                  " U R                  U5      4S[        R                  0UD6R                  5       PME     sn5      $ s  snf )z)
Returns sentences in specified fileids.
r2   )r   r?   rB   r   r7   rE   rF   s       r   sentsNKJPCorpusReader.sentsv   sn    
 
 &	 &F 

MM&)0@0K0KOU,.! &	
 	
rI   c           
          [        U Vs/ s HC  nU R                  " U R                  U5      4S[        R                  0UD6R                  5       PME     sn5      $ s  snf z%
Returns words in specified fileids.
r2   )r   r?   rB   r   r5   rE   rF   s       r   wordsNKJPCorpusReader.words   sn     
 &	 &F 

MM&)0@0K0KOU,.! &	
 	
rI   c                     UR                  S/ 5      n[        U Vs/ s HD  nU R                  " U R                  U5      4[        R
                  US.UD6R                  5       PMF     sn5      $ s  snf )zm
Call with specified tags as a list, e.g. tags=['subst', 'comp'].
Returns tagged words in specified fileids.
r3   )r2   r3   )r4   r   r?   rB   r   r5   rE   )r   r   r   r3   r"   s        r   tagged_wordsNKJPCorpusReader.tagged_words   s     zz&"% & &F 

MM&))44 	
 ,.! &

 
	
s   AA.c           
          [        U Vs/ s HC  nU R                  " U R                  U5      4S[        R                  0UD6R                  5       PME     sn5      $ s  snf rN   )r   r?   rB   r   r;   rE   rF   s       r   rawNKJPCorpusReader.raw   sn    
 
 &	 &F 

MM&)0@0I0IMS,.! &	
 	
rI   r	   )z.*r   )__name__
__module____qualname____firstlineno__r5   r7   r9   r;   r   r    r   r?   rB   r   rG   rK   rO   rR   rU   __static_attributes__ r   r   r   r       s    JJKH'4
A-&" 
 
 
 
 
 
 
 
$ 
 
r   r   c                   &    \ rS rSrS rS rS rSrg)r:      c                 Z    SU l         [        R                  " XS-   U R                   5        g)zc
HEADER_MODE
A stream backed corpus view specialized for use with
header.xml files in NKJP corpus.
z.*/sourceDesc$r%   N)tagspecr   r   r   r>   r   s      r   r   NKJPCorpus_Header_View.__init__   s$     (t%<dllKr   c                     U R                  5         / n [        R                  " X R                  5      n[	        U5      S:X  a  OUR                  U5        MC  U R                  5         U$ Nr   )_openr   
read_block_streamlenextendclose)r   rG   segms      r   rE   #NKJPCorpus_Header_View.handle_query   sV    

 ++D,,?D4yA~MM$	 
 	

r   c                 l   UR                  S5      n/ nU(       a  SR                  S U 5       5      nUR                  S5      n/ nU(       a  SR                  S U 5       5      nUR                  S5      n/ nU(       a  SR                  S U 5       5      nUR                  S5      n	/ n
U	(       a  SR                  S	 U	 5       5      n
UR                  S
5      n/ nU(       a  SR                  S U 5       5      nUR                  S5      n/ nU(       a  SR                  S U 5       5      nUUUU
UUS.$ )Nz
bibl/title
c              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   textstrip).0titles     r   	<genexpr>4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   s     EfUjj..00f   &(zbibl/authorc              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   rp   )rs   authors     r   ru   rv      s     Iv{{0022rw   z	bibl/datec              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   rp   )rs   dates     r   ru   rv           A54YY__..5rw   zbibl/publisherc              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   rp   )rs   	publishers     r   ru   rv      s     !U*Y.."6"6"8"8*rw   z	bibl/idnoc              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   rp   )rs   idnos     r   ru   rv      r|   rw   z	bibl/notec              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   rp   )rs   notes     r   ru   rv      r|   rw   )rt   ry   r{   r~   r   r   )findallr)   )r   eltcontexttitlesrt   authorsry   datesr{   
publishersr~   idnosr   notesr   s                  r   
handle_elt!NKJPCorpus_Header_View.handle_elt   s   \*IIEfEEE++m,YYIIIFK(99A5AAD[[!12
			!U*!UUIK(99A5AADK(99A5AAD "
 	
r   )r`   N)rW   rX   rY   rZ   r   rE   r   r[   r\   r   r   r:   r:      s    L	&
r   r:   c                   *    \ rS rSrSrS rS rS rSrg)XML_Tool   z
Helper class creating xml file to one without references to nkjp: namespace.
That's needed because the XMLCorpusView assumes that one can find short substrings
of XML that are valid XML, which is not true if a namespace is declared at top level
c                 ~    [         R                  R                  X5      U l        [        R
                  " SS9U l        g )NF)delete)r'   r(   r)   	read_filetempfileNamedTemporaryFile
write_file)r   r!   r>   s      r   r   XML_Tool.__init__   s(    d5"55UCr   c                     [        U R                  5      nU R                  nSn[        U5      (       a  UR	                  5       n[
        R                  " SU5      nSR                  U5      n[
        R                  " SU5      nSR                  U5      n[
        R                  " SU5      nSR                  U5      n[
        R                  " SU5      nSR                  U5      n[
        R                  " SU5      nSR                  U5      nUR                  U5        [        U5      (       a  M  UR                  5         UR                  5         U R                  R                  $ ! [         a  nU R                  5         [        UeS nAff = f)N znkjp:[^ ]* z<nkjp:paren>z</nkjp:paren>z<choice>z	</choice>)openr   r   rh   readlinerer+   r)   writerj   name	Exceptionremove_preprocessed_file)r   frfwlinexretes          r   build_preprocessed_file XML_Tool.build_preprocessed_file   s&   	#dnn%BBDd)){{}HH^T2hhqkHH^S1hhqkHH_c2hhqkHHZ-hhqkHH[#.hhqk d)) HHJHHJ??''' 	#))+"	#s   D,E& 05E& &
F0FFc                 X    [         R                  " U R                  R                  5        g r   )r'   remover   r   )r   s    r   r   !XML_Tool.remove_preprocessed_file  s    
		$//&&'r   )r   r   N)	rW   rX   rY   rZ   __doc__r   r   r   r[   r\   r   r   r   r      s    D#2(r   r   c                   H    \ rS rSrSrS rS rS rS rS r	S r
S	 rS
 rSrg)r8   i  za
A stream backed corpus view specialized for use with
ann_segmentation.xml files in NKJP corpus.
c                    SU l         [        U[        R                  S9U l        U R                  R	                  5         [        US5      U l        [        R                  " X R                  R                  5       U R                   5        g )Nz.*p/.*s)r2   zann_segmentation.xml)
r`   r<   r7   	text_viewrE   r   xml_toolr   r   r   ra   s      r   r   %NKJPCorpus_Segmentation_View.__init__!  sf     -/::
 	##% +AB--7794<<	
r   c                 N    UR                  S5      S   R                  S5      S   $ )N(r   ,r   )r+   )r   example_words     r   get_segm_id(NKJPCorpus_Segmentation_View.get_segm_id/  s(    !!#&q)//4Q77r   c                 <    [        UR                  S5      S   5      $ )Nr   r   )intr+   )r   beg_words     r   get_sent_beg)NKJPCorpus_Segmentation_View.get_sent_beg2  s    8>>#&q)**r   c                     UR                  S5      S   R                  S5      n[        US   5      [        US   5      -   $ )N)r   r   r   r   )r+   r   )r   end_wordsplitteds      r   get_sent_end)NKJPCorpus_Segmentation_View.get_sent_end6  s=    >>#&q)//48A;#hqk"222r   c                     U R                  US   5      nU R                  R                  U   nU R                  US   5      nU R	                  U[        U5      S-
     5      nX4U $ )Nr   r   )r   r   	segm_dictr   r   rh   )r   	sent_segmidrk   begends         r   get_sentences*NKJPCorpus_Segmentation_View.get_sentences;  sf    il+~~''+	!-	#i.1*< =>}r   c                     / nSnSnU HU  nU R                  U5      nU R                  U5      US-
  :  d  XF:w  a"  UR                  U5        U R                  U5      nUnMW     U$ )Nr   )r   r   appendr   )r   rk   r   prev_txt_endprev_txt_nrwordtxt_nrs          r   remove_choice*NKJPCorpus_Segmentation_View.remove_choiceC  sp    D%%d+F  &)99[=R

4 #006 K  
r   c                     U R                  5         / n [        R                  " X R                  5      n[	        U5      S:X  a  O<U H4  nU R                  U5      nUR                  U R                  U5      5        M6     Ml  U R                  5         U R                  R                  5         U$ ! [         a&  nU R                  R                  5         [        UeS nAff = frd   )re   r   rf   rg   rh   r   r   r   rj   r   r   r   )r   	sentencesr   rk   r   s        r   rE   )NKJPCorpus_Segmentation_View.handle_queryQ  s    	#JJLI)44T<<H	y>Q&%D--d3D$$T%7%7%=> &	  JJLMM224 	#MM224"	#s   B*B- -
C7!CCc                 \    / nU H#  nUR                  UR                  S5      5        M%     U$ )Ncorresp)r   get)r   r   r   r   segs        r   r   'NKJPCorpus_Segmentation_View.handle_eltc  s+    CJJswwy)* 
r   )r`   r   r   N)rW   rX   rY   rZ   r   r   r   r   r   r   r   rE   r   r[   r\   r   r   r8   r8     s/    

8+3
#$r   r8   c                   B    \ rS rSrSrSrSrS rS rSS jr	S	 r
S
 rSrg)r<   ij  zU
A stream backed corpus view specialized for use with
text.xml files in NKJP corpus.
r   r   c                     UR                  SS5      U l        SU l        [        5       U l        [        US5      U l        [        R                  " X R                  R                  5       U R                  5        g )Nr2   r   z	.*/div/abztext.xml)
r4   r2   r`   dictr   r   r   r   r   r   ra   s      r   r   NKJPCorpus_Text_View.__init__s  sV    JJvq)	" :6--7794<<	
r   c                     U R                  5         U R                  U R                  5      nU R                  5         U R                  R                  5         U$ ! [         a&  nU R                  R                  5         [        UeS nAff = fr   )re   rf   rg   rj   r   r   r   )r   r   r   s      r   rE   !NKJPCorpus_Text_View.handle_query~  sh    	#JJL-AJJLMM224H 	#MM224"	#s   AA 
B	#!BB	Nc                     / n [         R                  " X5      n[        U5      S:X  a  OU H  nUR                  U5        M     MB  SR	                  U Vs/ s H  oUPM     sn5      /$ s  snf )z&
Returns text as a list of sentences.
r   r   )r   rf   rh   r   r)   )r   streamr`   elt_handlertxtrk   parts          r   rf   NKJPCorpus_Text_View.read_block  sk      ++D9D4yA~

4  	  3/34$3/011/s   A(c                 |    UR                    H,  nUR                  S5      (       d  M  UR                  U5      s  $    g )Nr   )attribendswithr   )r   r   attrs      r   r    NKJPCorpus_Text_View.get_segm_id  s-    JJD}}T""wwt}$ r   c                     U R                   [        R                  L a(  UR                  U R                  U R                  U5      '   UR                  $ r   )r2   r<   r7   rq   r   r   )r   r   r   s      r   r   NKJPCorpus_Text_View.handle_elt  s;    99,77747HHDNN4++C01xxr   )r2   r   r`   r   )NN)rW   rX   rY   rZ   r   r7   r;   r   rE   rf   r   r   r[   r\   r   r   r<   r<   j  s,    
 JH	
	#2%
r   r<   c                   *    \ rS rSrSrS rS rS rSrg)r6   i  za
A stream backed corpus view specialized for use with
ann_morphosyntax.xml files in NKJP corpus.
c                     UR                  SS 5      U l        SU l        [        US5      U l        [
        R                  " X R                  R                  5       U R                  5        g )Nr3   z	.*/seg/fszann_morphosyntax.xml)r4   r3   r`   r   r   r   r   r   ra   s      r   r   NKJPCorpus_Morph_View.__init__  sN    JJvt,	" +AB--7794<<	
r   c                     U R                  5         / n [        R                  " X R                  5      n[	        U5      S:X  a  O!U H  nUc  M  UR                  U5        M     MQ  U R                  5         U R                  R                  5         U$ ! [         a&  nU R                  R                  5         [        UeS nAff = frd   )
re   r   rf   rg   rh   r   rj   r   r   r   )r   rO   rk   r   r   s        r   rE   "NKJPCorpus_Morph_View.handle_query  s    	#JJLE$//llCt9> D'T* !	  JJLMM224L 	#MM224"	#s   A	B AB 
C!B==Cc                    SnSnSnU R                   c  SnU GH_  nSUR                  5       ;   a<  UR                  S   S:X  a)  U H!  nUR                  S:X  d  M  UR                  nM#     MT  SUR                  5       ;   d  Mj  UR                  S   S:X  d  M  U H  nSUR                  5       ;   d  M  UR                  S   S	:X  d  M.  U H  nSUR                  5       ;   d  M  UR                  S   S
:X  d  M.  U Hr  n	SU	R                  5       ;   a.  U R                   b!  U	R                  S   U R                   ;   a  SnME  SU	R                  5       ;   d  M[  U	R                  S   S:X  d  Mp  SnMt     M     M     GMb     U(       a
  U(       a  U$ g g )N FTr   orthstringinterpstypelexctagvalueinterp)r3   keysr   tagrq   )
r   r   r   r   flagis_not_interpchildsymbolsymbol2symbol3s
             r   r    NKJPCorpus_Morph_View.handle_elt  sK   99DE%%,,v*>&*H#FzzX-%{{ $ 5::<'ELL,@I,M#F.6==3HE3Q'-G &',,. 8$+NN6$:f$D/6G(/7<<>(A,0II,A,3NN7,Ctyy,P/3(/7<<>(A,3NN7,Cx,O8= 07 (. $ 4 MK "4r   )r3   r`   r   N)	rW   rX   rY   rZ   r   r   rE   r   r[   r\   r   r   r6   r6     s    

#$#r   r6   )r   r'   r   r   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   r   r   r:   r   r8   r<   r6   r\   r   r   <module>r	     so     	 	  * ER
 R
j;
] ;
|%( %(PL= L^6= 6rCM Cr   