
    Of$                        S r SSKrSSK7  SSK7  SSKJr  SSKJr  \R                  S5      r
\R                  S5      r\R                  S5      r\R                  S	5      r " S
 S\5      r " S S\\5      r " S S\5      rg)zO
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
    N)*)map_tag)Treez\((\d+) ([^\s()]+) ([^\s()]+)\)z\(([^\s()]+) ([^\s()]+)\)z\([^\s()]+ ([^\s()]+)\)z
\s*\(\s*\(c                   L    \ rS rSrSr    SS jrS rS rS rSS jr	S	 r
S
rg)BracketParseCorpusReader   z
Reader for corpora that consist of parenthesis-delineated parse trees,
like those found in the "combined" section of the Penn Treebank,
e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

Nc                 T    [         R                  XX%5        X0l        X@l        X`l        g)ag  
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
:param comment_char: The character which can appear at the start of
    a line to indicate that the rest of the line is a comment.
:param detect_blocks: The method that is used to find blocks
    in the corpus; can be 'unindented_paren' (every unindented
    parenthesis starts a new parse) or 'sexpr' (brackets are
    matched).
:param tagset: The name of the tagset used by this corpus, to be used
    for normalizing or converting the POS tags returned by the
    ``tagged_...()`` methods.
N)SyntaxCorpusReader__init___comment_char_detect_blocks_tagset)selfrootfileidscomment_chardetect_blocksencodingtagsets          B/usr/lib/python3/dist-packages/nltk/corpus/reader/bracket_parse.pyr   !BracketParseCorpusReader.__init__"   s%    , 	##DB)+    c           	      ~   U R                   S:X  a  [        XR                  S9$ U R                   S:X  a  [        U5      $ U R                   S:X  ad  [	        USS9nU R                  (       aG  U Vs/ s H:  n[
        R                  S[
        R                  U R                  5      -  SU5      PM<     nnU$  S	5       es  snf )
Nsexpr)r   	blanklineunindented_parenz^\()start_rez	(?m)^%s.* zbad block type)r   read_sexpr_blockr   read_blankline_blockread_regexp_blockresubescape)r   streamtokstoks       r   _read_block$BracketParseCorpusReader._read_block=   s    ')#F9K9KLL  K/'//  $66$Vf=D!!  $# FF;43E3E)FFCP#   K&&&1s   .AB:c                 b    [         R                  SSU5      n[         R                  SSU5      nU$ )Nz\((.)\)z(\1 \1)z"\(([^\s()]+) ([^\s()]+) [^\s()]+\)(\1 \2)r"   r#   r   ts     r   
_normalize#BracketParseCorpusReader._normalizeO   s,    FF:z1-FF8*aHr   c           	      ~    [         R                  " U R                  U5      5      nUR                  5       S:X  a  [	        U5      S:X  a  US   $ U$ ! [
         a  n[        R                  R                  S5        UR                  S:X  an  [        SS5       H^  n [        U R                  USU-  -   5      5      n[        R                  R                  SU-  5        Us  s S nA$ ! [
         a     M\  f = f   [        R                  R                  S	5        [        S
U R                  U5      5      s S nA$ S nAff = f)Nr      r   z(Bad tree detected; trying to recover...
)zmismatched parens   )z(  Recovered by adding %d close paren(s)
z'  Recovered by returning a flat parse.
S)r   
fromstringr/   labellen
ValueErrorsysstderrwriteargsrange_tag)r   r.   treeenvs         r   _parseBracketParseCorpusReader._parseV   s   	+??4??1#56Dzz|r!c$i1nAw 	+JJHIvv//q!A S1W!=>

((JQN  !%  % JJGHTYYq\**!	+sO   AA A 
D<>D7AC%D7D<%
C3/D72C33>D71D<7D<c           
         [         R                  U R                  U5      5       VVs/ s H  u  p4XC4PM
     nnnU(       a;  X R                  :w  a,  U VVs/ s H  u  pCU[	        U R                  X#5      4PM     nnnU$ s  snnf s  snnf N)TAGWORDfindallr/   r   r   )r   r.   r   pwtagged_sents         r   r?   BracketParseCorpusReader._tagq   s{    ,3OODOOA<N,OP,O&1v,OPf,DODO&1GDLL&45K    Qs   A>$Bc                 J    [         R                  U R                  U5      5      $ rG   )WORDrI   r/   r-   s     r   _wordBracketParseCorpusReader._wordy   s    ||DOOA.//r   )r   r   r   )Nr   utf8NrG   )__name__
__module____qualname____firstlineno____doc__r   r(   r/   rD   r?   rP   __static_attributes__ r   r   r   r      s3     (6'$+60r   r   c                      ^  \ rS rSrSrS rSU 4S jjrSU 4S jjrSU 4S jjrSU 4S jjr	SU 4S jjr
SU 4S	 jjrS
rU =r$ )#CategorizedBracketParseCorpusReader}   z
A reader for parsed corpora whose documents are
divided into categories based on their file identifiers.
@author: Nathan Schneider <nschneid@cs.cmu.edu>
c                 b    [         R                  X5        [        R                  " U /UQ70 UD6  g)a<  
Initialize the corpus reader.  Categorization arguments
(C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
the L{CategorizedCorpusReader constructor
<CategorizedCorpusReader.__init__>}.  The remaining arguments
are passed to the L{BracketParseCorpusReader constructor
<BracketParseCorpusReader.__init__>}.
N)CategorizedCorpusReaderr   r   )r   r=   kwargss      r   r   ,CategorizedBracketParseCorpusReader.__init__   s*     	 ((6 ))$@@@r   c                 B   > [         TU ]  U R                  X5      U5      $ rG   )supertagged_words_resolver   r   
categoriesr   	__class__s       r   rc   0CategorizedBracketParseCorpusReader.tagged_words       w#DMM'$FOOr   c                 B   > [         TU ]  U R                  X5      U5      $ rG   )rb   tagged_sentsrd   re   s       r   rk   0CategorizedBracketParseCorpusReader.tagged_sents   ri   r   c                 B   > [         TU ]  U R                  X5      U5      $ rG   )rb   tagged_parasrd   re   s       r   rn   0CategorizedBracketParseCorpusReader.tagged_paras   ri   r   c                 @   > [         TU ]  U R                  X5      5      $ rG   )rb   parsed_wordsrd   r   r   rf   rg   s      r   rq   0CategorizedBracketParseCorpusReader.parsed_words       w#DMM'$FGGr   c                 @   > [         TU ]  U R                  X5      5      $ rG   )rb   parsed_sentsrd   rr   s      r   rv   0CategorizedBracketParseCorpusReader.parsed_sents   rt   r   c                 @   > [         TU ]  U R                  X5      5      $ rG   )rb   parsed_parasrd   rr   s      r   ry   0CategorizedBracketParseCorpusReader.parsed_paras   rt   r   rY   )NNN)NN)rS   rT   rU   rV   rW   r   rc   rk   rn   rq   rv   ry   rX   __classcell__)rg   s   @r   r[   r[   }   s8    
APPPHHH Hr   r[   c                   <    \ rS rSrSrS	S jrS
S jrSS jrS rSr	g)AlpinoCorpusReader   a  
Reader for the Alpino Dutch Treebank.
This corpus has a lexical breakdown structure embedded, as read by `_parse`
Unfortunately this puts punctuation and some other words out of the sentence
order in the xml element tree. This is no good for `tag_` and `word_`
`_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
to the overridden _normalize function. The _parse function can then remain
untouched.
Nc           	      4    [         R                  U USSUUS9  g )Nzalpino\.xmlr   )r   r   r   )r   r   )r   r   r   r   s       r   r   AlpinoCorpusReader.__init__   s'     ))% 	* 	
r   c                 >   USS S:w  a  g[         R                  SSU5      nU(       a  [         R                  SSU5      nO[         R                  S	S
U5      n[         R                  SSU5      n[         R                  SSU5      n[         R                  SSU5      nU$ )a=  Normalize the xml sentence element in t.
The sentence elements <alpino_ds>, although embedded in a few overall
xml elements, are separated by blank lines. That's how the reader can
deliver them one at a time.
Each sentence has a few category subnodes that are of no use to us.
The remaining word nodes may or may not appear in the proper order.
Each word node has attributes, among which:
- begin : the position of the word in the sentence
- pos   : Part of Speech: the Tag
- word  : the actual word
The return value is a string with all xml elementes replaced by
clauses: either a cat clause with nested clauses, or a word clause.
The order of the bracket clauses closely follows the xml.
If ordered == True, the word clauses include an order sequence number.
If ordered == False, the word clauses only have pos and word parts.
N
   z
<alpino_dsr   z  <node .*? cat="(\w+)".*>z(\1z>  <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>z
(\1 \2 \3)z-  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>r+   z	  </node>r4   z<sentence>.*</sentence>z</?alpino_ds.*>r,   )r   r.   ordereds      r   r/   AlpinoCorpusReader._normalize   s    " Sb6\!FF0&!<QA GUVWAFF<q)FF-sA6FF%sA.r   c                    [         R                  U R                  USS95       VVVs/ s H  u  p4n[        U5      XT4PM     nnnnUR	                  5         U(       a?  X R
                  :w  a0  U VVVs/ s H  u  p5oE[        U R
                  X$5      4PM     nnnnU$ U VVVs/ s H	  u  p5oEU4PM     nnnnU$ s  snnnf s  snnnf s  snnnf )NT)r   )
SORTTAGWRDrI   r/   intsortr   r   )r   r.   r   orJ   rK   rL   s          r   r?   AlpinoCorpusReader._tag   s     (//40PQ
Q	q VQNQ 	 
 	f,GRGR)1GDLL&45{  
  4??;iqQq6;K?
 @s   B82$B?"Cc                 ^    U R                  U5      nU VVs/ s H  u  p4UPM	     snn$ s  snnf )z(Return a correctly ordered list if words)r?   )r   r.   rL   rK   rJ   s        r   rP   AlpinoCorpusReader._word   s)    iil +,fq,,,s   )rY   )z
ISO-8859-1N)FrG   )
rS   rT   rU   rV   rW   r   r/   r?   rP   rX   rY   r   r   r}   r}      s    
 D-r   r}   )rW   r:   nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tagr   	nltk.treer   r"   compiler   rH   rO   EMPTY_BRACKETSr
   r   r^   r[   r}   rY   r   r   <module>r      s     $ %   ZZ:;

**1
2	zz,-M*`01 `0F%H5%HPH-1 H-r   