
    OfT                         S r SSKrSSK7  SSK7  SSKJr  SSKJr  SSKJ	r	J
r
   " S S\5      r " S	 S
5      r " S S\5      r " S S\5      rg)z!
Read CoNLL-style chunk fileids.
    N)*)map_tag)Tree)LazyConcatenationLazyMapc                   .   \ rS rSrSrSrSrSrSrSr	Sr
S	r\\\\\	\
\4rS
SSSS\S
S
4S jrS'S jrS'S jrS(S jrS(S jrS)S jrS)S jrS)S jrS'S jrS*S jrS(S jrS(S jrS'S jrS rS rS'S jrS'S jrS'S  jrS'S! jr S" r!S# r"S$ r#\$S% 5       r%S&r&g
)+ConllCorpusReader   a  
A corpus reader for CoNLL-style files.  These files consist of a
series of sentences, separated by blank lines.  Each sentence is
encoded using a table (or "grid") of values, where each line
corresponds to a single word, and each column corresponds to an
annotation type.  The set of columns used by CoNLL-style files can
vary from corpus to corpus; the ``ConllCorpusReader`` constructor
therefore takes an argument, ``columntypes``, which is used to
specify the columns that are used by a given corpus. By default
columns are split by consecutive whitespaces, with the
``separator`` argument you can set a string to split by (e.g.
``' '``).


@todo: Add support for reading from corpora where different
    parallel files contain different columns.
@todo: Possibly add caching of the grid corpus view?  This would
    allow the same grid view to be used by different data access
    methods (eg words() and parsed_sents() could both share the
    same grid corpus view object).
@todo: Better support for -DOCSTART-.  Currently, we just ignore
    it, but it could be used to define methods that retrieve a
    document at a time (eg parsed_documents()).
wordspostreechunknesrlignoreNSFTutf8c                 V   U H   nXR                   ;  d  M  [        SU-  5      e   [        U[        5      (       a  U/nX@l        [        U5       VVs0 s H  u  pX_M	     snnU l        X`l        XPl        Xpl	        Xl
        [        R                  XX(5        Xl        Xl        g s  snnf )NzBad column type %r)COLUMN_TYPES
ValueError
isinstancestr_chunk_types	enumerate_colmap_pos_in_tree_root_label_srl_includes_roleset_tree_classCorpusReader__init___tagsetsep)selfrootfileidscolumntypeschunk_types
root_labelpos_in_treesrl_includes_rolesetencoding
tree_classtagset	separator
columntypeics                  :/usr/lib/python3/dist-packages/nltk/corpus/reader/conll.pyr!   ConllCorpusReader.__init__C   s     &J!2!22 !5
!BCC & k3''&-K'+4[+AB+A!+AB'%%9"%d'< Cs   B%c                     U R                  U R                  5        [        [        U R                  U R                  U5      5      5      $ N)_requireWORDSr   r   
_get_words_gridsr$   r&   s     r3   r   ConllCorpusReader.wordsd   s1    djj! $++g:N!OPP    c                     U R                  U R                  5        [        U R                  U R	                  U5      5      $ r6   )r7   r8   r   r9   r:   r;   s     r3   sentsConllCorpusReader.sentsh   s,    djj!tG(<==r=   c                    ^ ^ T R                  T R                  T R                  5        U U4S jn[        [	        UT R                  U5      5      5      $ )Nc                 (   > TR                  U T5      $ r6   _get_tagged_wordsgridr$   r.   s    r3   get_tagged_words8ConllCorpusReader.tagged_words.<locals>.get_tagged_wordso       ))$77r=   )r7   r8   POSr   r   r:   r$   r&   r.   rG   s   ` ` r3   tagged_wordsConllCorpusReader.tagged_wordsl   s<    djj$((+	8 !)94;;w;O!PQQr=   c                    ^ ^ T R                  T R                  T R                  5        U U4S jn[        UT R	                  U5      5      $ )Nc                 (   > TR                  U T5      $ r6   rC   rE   s    r3   rG   8ConllCorpusReader.tagged_sents.<locals>.get_tagged_wordsw   rI   r=   )r7   r8   rJ   r   r:   rK   s   ` ` r3   tagged_sentsConllCorpusReader.tagged_sentst   s7    djj$((+	8 'W)=>>r=   c                    ^ ^^ T R                  T R                  T R                  T R                  5        Tc  T R                  mUU U4S jn[        [        UT R                  U5      5      5      $ )Nc                 *   > TR                  U TT5      $ r6   _get_chunked_wordsrF   r(   r$   r.   s    r3   get_chunked_words:ConllCorpusReader.chunked_words.<locals>.get_chunked_words       **4fEEr=   )r7   r8   rJ   CHUNKr   r   r   r:   r$   r&   r(   r.   rX   s   ` `` r3   chunked_wordsConllCorpusReader.chunked_words|   sT    djj$((DJJ7++K	F !):DKK<P!QRRr=   c                    ^ ^^ T R                  T R                  T R                  T R                  5        Tc  T R                  mUU U4S jn[        UT R                  U5      5      $ )Nc                 *   > TR                  U TT5      $ r6   rU   rW   s    r3   rX   :ConllCorpusReader.chunked_sents.<locals>.get_chunked_words   rZ   r=   )r7   r8   rJ   r[   r   r   r:   r\   s   ` `` r3   chunked_sentsConllCorpusReader.chunked_sents   sO    djj$((DJJ7++K	F ($++g*>??r=   c                    ^ ^^ T R                  T R                  T R                  T R                  5        Tc  T R                  mUU U4S jn[        UT R                  U5      5      $ )Nc                 *   > TR                  U TT5      $ r6   )_get_parsed_sent)rF   r*   r$   r.   s    r3   get_parsed_sent7ConllCorpusReader.parsed_sents.<locals>.get_parsed_sent   s    (({FCCr=   )r7   r8   rJ   TREEr   r   r:   )r$   r&   r*   r.   rg   s   ` `` r3   parsed_sentsConllCorpusReader.parsed_sents   sN    djj$((DII6++K	D G(<==r=   c                     U R                  U R                  5        [        U R                  U R	                  U5      5      $ r6   )r7   SRLr   _get_srl_spansr:   r;   s     r3   	srl_spansConllCorpusReader.srl_spans   s.    dhht**DKK,@AAr=   c                   ^ ^ T R                  T R                  T R                  T R                  T R                  5        Tc  T R
                  mUU 4S jn[        UT R                  U5      5      nU(       a  [        U5      nU$ )Nc                 (   > TR                  U T5      $ r6   )_get_srl_instances)rF   r*   r$   s    r3   get_srl_instances:ConllCorpusReader.srl_instances.<locals>.get_srl_instances   s    **4==r=   )	r7   r8   rJ   ri   rm   r   r   r:   r   )r$   r&   r*   flattenrt   results   ` `   r3   srl_instancesConllCorpusReader.srl_instances   se    djj$((DIItxx@++K	> *DKK,@A&v.Fr=   c                    ^ ^ T R                  T R                  T R                  T R                  5        U U4S jn[	        [        UT R                  U5      5      5      $ )z
:return: a list of word/tag/IOB tuples
:rtype: list(tuple)
:param fileids: the list of fileids that make up this corpus
:type fileids: None or str or list
c                 (   > TR                  U T5      $ r6   _get_iob_wordsrE   s    r3   get_iob_words2ConllCorpusReader.iob_words.<locals>.get_iob_words       &&tV44r=   )r7   r8   rJ   r[   r   r   r:   r$   r&   r.   r~   s   ` ` r3   	iob_wordsConllCorpusReader.iob_words   sC     	djj$((DJJ7	5 !G8L!MNNr=   c                    ^ ^ T R                  T R                  T R                  T R                  5        U U4S jn[	        UT R                  U5      5      $ )z
:return: a list of lists of word/tag/IOB tuples
:rtype: list(list)
:param fileids: the list of fileids that make up this corpus
:type fileids: None or str or list
c                 (   > TR                  U T5      $ r6   r|   rE   s    r3   r~   2ConllCorpusReader.iob_sents.<locals>.get_iob_words   r   r=   )r7   r8   rJ   r[   r   r:   r   s   ` ` r3   	iob_sentsConllCorpusReader.iob_sents   s>     	djj$((DJJ7	5 }dkk'&:;;r=   c                     [        U R                  US5       VVs/ s H  u  p#[        X R                  US9PM     snn5      $ s  snnf )NT)r,   )concatabspathsStreamBackedCorpusView_read_grid_block)r$   r&   fileidencs       r3   r:   ConllCorpusReader._grids   sM      &*]]7D%A%AMV 'v/D/DsS%A
 	
s    A
c                    / n[        U5       H  nUR                  5       nU(       d  M  UR                  S5       Vs/ s H  oDR                  U R                  5      PM     nnUS   U R                  R                  SS5         S:X  a  US	 U H,  n[        U5      [        US   5      :w  d  M   [        SU-  5      e   UR                  U5        M     U$ s  snf )N
r   r   z
-DOCSTART-z"Inconsistent number of columns:
%s)	read_blankline_blockstripsplitr#   r   getlenr   append)r$   streamgridsblocklinerF   rows          r3   r   "ConllCorpusReader._read_grid_block   s    )&1EKKME5:[[5FG5FTJJtxx(5FDG Awt||''34DG s8s47|+$%JU%RSS  LL! 2"  Hs   $Cc                 >    U R                  XR                  S   5      $ )Nr   )_get_columnr   )r$   rF   s     r3   r9   ConllCorpusReader._get_words   s    ll7&;<<r=   c           	      "   U R                  XR                  S   5      nU(       a5  X R                  :w  a&  U Vs/ s H  n[        U R                  X$5      PM     nn[	        [        U R                  XR                  S   5      U5      5      $ s  snf )Nr   r   r   r   r"   r   listzipr$   rF   r.   pos_tagsts        r3   rD   #ConllCorpusReader._get_tagged_words   sr    ##D,,u*=>f,BJK(Qf8(HKC((||G/DExPQQ Ls    Bc                 \   U R                  XR                  S   5      nU(       a5  X R                  :w  a&  U Vs/ s H  n[        U R                  X$5      PM     nn[	        [        U R                  XR                  S   5      UU R                  XR                  S   5      5      5      $ s  snf )Nr   r   r   r   r   s        r3   r}    ConllCorpusReader._get_iob_words   s    ##D,,u*=>f,BJK(Qf8(HK  ||G'<=  ||G'<=
 	
 Ls    B)c                 
   U R                  XR                  S   5      nU R                  XR                  S   5      nU(       a5  X0R                  :w  a&  U Vs/ s H  n[        U R                  X65      PM     nnU R                  XR                  S   5      n[	        U R
                  / 5      /n[        XEU5       H  u  pnUS:X  a  Su  pOUR                  S5      u  pUb  X;  a  SnUS:X  a  XS   R                  5       :w  a  S	nUS
;   a  [        U5      S:X  a  UR                  5         US	:X  a1  [	        U/ 5      nUS   R                  U5        UR                  U5        US   R                  X45        M     US   $ s  snf )Nr   r   r   O)r    -IBBO   r   )r   r   r"   r   r   r   r   r   labelr   popr   )r$   rF   r(   r.   r   r   r   
chunk_tagsstackwordpos_tag	chunk_tagstate
chunk_type	new_chunks                  r3   rV   $ConllCorpusReader._get_chunked_words  sc     ||G'<=##D,,u*=>f,BJK(Qf8(HK%%dLL,AB
d&&+,(+EZ(H$D9C$+!z&/ooc&:#&:+H|
Bioo.? ?}Uq		| R0	b	  +Y'"Id_-) )I, Qx7 Ls    F c           	         U R                  XR                  S   5      nU R                  XR                  S   5      nU(       a5  X0R                  :w  a&  U Vs/ s H  n[        U R                  X65      PM     nnU R                  XR                  S   5      nSn[	        XEU5       H\  u  pnU	S:X  a  Sn	U	S:X  a  Sn	U
S:X  a  Sn
U
S:X  a  Sn
UR                  S	5      u  pUR                  S5      S-  nX S
U
 SU	 SU 3-  nM^      U R                  R                  U5      nU(       d  UR                  5        Hr  n[        U5       H`  u  nn[        U[        5      (       d  M  [!        U5      S:X  d  M.  [        US   ["        5      (       d  MH  US   UR%                  5       4UU'   Mb     Mt     U$ s  snf ! [        [        4 a/    U R                  R                  SU R                   SU S35      n Nf = f)Nr   r   r   r   (z-LRB-)z-RRB-r   z ( z)    r   )r   r   r"   r   r   r   countr   
fromstringr   
IndexErrorr   subtreesr   r   r   r   r   r   )r$   rF   r*   r.   r   r   r   
parse_tagstreestrr   r   	parse_tagleftrightr   subtreer1   childs                     r3   rf   "ConllCorpusReader._get_parsed_sent(  s     ||G'<=##D,,u*=>f,BJK(Qf8(HK%%dLL,@A
(+EZ(H$D9s{s{#~!#~!%OOC0MTKK$s*Er'!D6E7;;G )I	R##..w7D ==? )' 2HAu"5$//J!O&uQx55&+Ah%>
 !3 + ? L$ J' 	R##..43C3C2DAgYa/PQD	Rs    F0F5 5<G43G4c                    U R                   (       a4  U R                  XR                  S   S-   5      nU R                  S   S-   nO0U R                  XR                  S   5      nU R                  S   S-   n[        U Vs/ s H  oDS:w  d  M
  UPM     sn5      n/ n[	        U5       H  nU R                  XU-   5      n/ n	/ n
[        U5       H  u  pUR                  S5      u  pUR                  S5       H  nU(       d  M  U
R                  X45        M      [	        UR                  S5      5       H.  nU
R                  5       u  nnU	R                  UUS-   4U45        M0     M     UR                  U	5        M     U$ s  snf )z+
list of list of (start, end), tag) tuples
r   r   r   r   r   r   r   )
r   r   r   r   ranger   r   r   r   r   )r$   rF   
predicates	start_colp	num_preds	spanlistsr1   colspanlistr   wordnumsrl_tagr   r   tagstarts                    r3   rn    ConllCorpusReader._get_srl_spansM  sa    %%))$U0Ca0GHJU+a/I))$U0CDJU+a/I J;Jqs(J;<		y!A""4Q7CHE$-cN  'c 2::c?Csc^4 + u{{3/0A#(99;LS%OOeWq[%93$?@ 1 %3 X& " # <s   ?	E>E>c           
      R   U R                  X5      nU R                  U5      nU R                  (       a@  U R                  XR                  S   S-   5      nU R                  XR                  S   5      nO-U R                  XR                  S   5      nS /[        U5      -  n[        U5      n[        U5       Hl  u  pU	S:X  a  M  U H,  n
U
 H!  u  u  pnU[        X5      ;   d  M  US;   d  M!    O   M,    O   [        SU	-  5      eUR                  [        X8XU   U
5      5        Mn     U$ )Nr   r   r   VzC-VzNo srl column found for %r)rf   rn   r   r   r   r   ConllSRLInstanceListr   r   r   r   ConllSRLInstance)r$   rF   r*   r   r   r   rolesets	instancesr   	predicater   r   endr   s                 r3   rs   $ConllCorpusReader._get_srl_instancesm  s(   $$T7''-	%%))$U0Ca0GHJ''ll5.ABH))$U0CDJvJ/H(.	"+J"7GC &)1%LU#%"33|8K *2  & !!=	!IJJ 	G;LhW #8& r=   c                 P    U H   nX R                   ;  d  M  [        SU-  5      e   g )Nz)This corpus does not contain a %s column.)r   r   )r$   r'   r0   s      r3   r7   ConllCorpusReader._require  s+    %J- BZO  &r=   c                 `    [        [        U 5      5       Vs/ s H
  o U   U   PM     sn$ s  snf r6   )r   r   )rF   column_indexr1   s      r3   r   ConllCorpusReader._get_column  s,    /4SY/?@/?!Q%/?@@@s   +)r   r   r   r   r   r"   r   r#   r6   )NN)NNN)NNT)'__name__
__module____qualname____firstlineno____doc__r8   rJ   ri   r[   NErm   IGNOREr   r   r!   r   r?   rL   rQ   r]   rb   rj   ro   rx   r   r   r:   r   r9   rD   r}   rV   rf   rn   rs   r7   staticmethodr   __static_attributes__ r=   r3   r	   r	      s    : E
CDE	B
CF 3eRf=L !BQ>R?S@>BO<$	
6=R

 D#J@H A Ar=   r	   c                   *    \ rS rSrSrS rS rS rSrg)r   i  zp
An SRL instance from a CoNLL corpus, which identifies and
providing labels for the arguments of a single verb.
c           	      H   / U l          X l         X0l        X@l        / U l         XPl         Xl         UR                  5       U l         U HU  u  u  pgnUS;   a)  U =R                   [        [        Xg5      5      -  sl         M7  U R                  R                  Xg4U45        MW     g )Nr   )verb	verb_head	verb_stemroleset	argumentstagged_spansr   leavesr   r   r   r   )	r$   r   r   r   r   r   r   r   r   s	            r3   r!   ConllSRLInstance.__init__  s    		/
 #	"
 #	F
 )	* 	G[[]
	 ".LU#l"		T%"344	%%|S&9:	 ".r=   c                     [        U R                  5      S:w  a  SOSnSU R                  [        U R                  5      U4-  $ )Nr   sr   z,<ConllSRLInstance for %r with %d argument%s>)r   r   r   )r$   plurals     r3   __repr__ConllSRLInstance.__repr__  s?     DNN+q0b=^^S0&9
 	
r=   c                   ^  SR                  U 4S jT R                   5       5      nSU< ST R                  < S3nSn[        T R                  5       Hl  u  pE[        U[        5      (       a  US   nT R                   H!  u  u  pgnXF:X  a  USU-  -  nXG:X  d  M  US	-  nM#     UT R                  ;   a  S
U-  nX5S-   -  nMn     U[        R                  " UR                  SS5      SSS9-   $ )Nr   c              3   H   >#    U  H  nTR                   U   S    v   M     g7f)r   N)r   ).0r1   r$   s     r3   	<genexpr>*ConllSRLInstance.pprint.<locals>.<genexpr>  s     ?Y4::a=+Ys   "zSRL for z (stem=z):
r   r   z[%s z] z<<%s>>z ]]z    )initial_indentsubsequent_indent)joinr   r   r   r   r   tupler   textwrapfillreplace)	r$   verbstrhdrr  r1   r   r   r   argids	   `        r3   pprintConllSRLInstance.pprint  s    ((?TYY??74>>*<DA ,GA$&&Aw'+~~#e:%'A8IA	 (6
 DII~$OA - X]]IIdC 6
 
 	
r=   )r   r   r   r   r   r   r   r   N)	r   r   r   r   r   r!   r  r  r   r   r=   r3   r   r     s    (;T

r=   r   c                   8    \ rS rSrSrS	S jrS rS
S jrS rSr	g)r   i  z(
Set of instances for a single sentence
c                 :    Xl         [        R                  X5        g r6   )r   r   r!   )r$   r   r   s      r3   r!   ConllSRLInstanceList.__init__  s    	d&r=   c                 "    U R                  5       $ r6   )r  )r$   s    r3   __str__ConllSRLInstanceList.__str__  s    {{}r=   c                    U  H(  nUR                   U R                   :w  d  M  [        S5      e   U(       aV  U R                   R                  5       nS /[        U5      -  nS/[        U5      -  nU R	                  U R                   SX4U5        Sn[        [        W5      5       H  nU(       a8  USX7   -  -  nUSWU   -  -  nUS[        WU   R                  S5      5      -  -  nU  H&  nXrR                  :X  d  M  USUR                  -  -  n  O
   USS-  -  nU  H@  nSnUR                   H#  u  u  pnXy:X  a  S	U U 3nXzS
-
  :X  d  M  US-  nM%     USU-  -  nMB     US-  nM     U$ )NzTree mismatch!r   r   r   z%-20s z%-8s z
%15s*%-8s r   r   r   r   z%-12s r   )r   r   r   r   _tree2conllr   r  r   r   r   r   )r$   include_treeinstr   r   syntr  r1   argstrr   r   r  s               r3   r  ConllSRLInstanceList.pprint  s   DyyDII% !122 
 II$$&E&3u:%C53u:%DTYY5t<s5z"AX((Ws1v%%\E$q'--*<$=== &DNN22A 
 X^#+/+<+<'LU%z#$UGF8!41W~#	 ,=
 X&&  IA/ #0 r=   c                    [        U[        5      (       d   e[        U5      S:X  a<  [        US   [        5      (       a$  UR	                  5       XB'   X2   US   :X  d   eUS-   $ [        U5      S:X  a=  [        US   [
        5      (       a%  [        US   5      S:X  d   eUS   u  XB'   XB'   US-   $ SUR	                  5        XR    3XR'   U H  nU R                  XbX4U5      nM     XRS-
  ==   S-  ss'   U$ )Nr   r   r   r   r   )r   r   r   r   r   r  r   )r$   r   r   r   r   r#  r   s          r3   r    ConllSRLInstanceList._tree2conll  s    $%%%%t9>ja#66::<CL>T!W,,,Q;Y!^
47E : :tAw<1$$$)-a&CL#,Q;

~dm_=DM**55tL 1$Nr=   )r   N)r   )F)
r   r   r   r   r   r!   r  r  r   r   r   r=   r3   r   r     s    '&Pr=   r   c                   $    \ rS rSrSr SS jrSrg)ConllChunkCorpusReaderi1  zT
A ConllCorpusReader whose data file contains three columns: words,
pos, and chunk.
Nc                 8    [         R                  U UUSUUUUS9  g )N)r   r   r   )r(   r,   r.   r/   )r	   r!   )r$   r%   r&   r(   r,   r.   r/   s          r3   r!   ConllChunkCorpusReader.__init__7  s/     	""%# 	# 		
r=   r   )r   NN)r   r   r   r   r   r!   r   r   r=   r3   r)  r)  1  s     SW
r=   r)  )r   r  nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tagr   	nltk.treer   	nltk.utilr   r   r    r	   r   r   r   r)  r   r=   r3   <module>r1     sX     $ %   0DA DANK
 K
\C4 CL
. 
r=   