ó
    ´OÂf…#  ã                   óŽ   • S r SSKrSSKrSSKrSSKJr  SSK7  SSKJ	r	  SSK
7  SSK7  SSKJr   " S S\5      r " S	 S
\5      rg)zN
A reader for corpora that contain chunked (and optionally tagged)
documents.
é    N)Útagstr2tree)Ú*)ÚBracketParseCorpusReader)ÚTreec                   ó˜   • \ rS rSrSrS\\" SSS9\SS4S	 jrSS
 jr	SS jr
SS jrSS jrSS jrSS jrSS jrSS jrSS jrS rSrg)ÚChunkedCorpusReaderé   a  
Reader for chunked (and optionally tagged) corpora.  Paragraphs
are split using a block reader.  They are then tokenized into
sentences using a sentence tokenizer.  Finally, these sentences
are parsed into chunk trees using a string-to-chunktree conversion
function.  Each of these steps can be performed using a default
function or a custom function.  By default, paragraphs are split
on blank lines; sentences are listed one per line; and sentences
are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
Ú Ú
T)ÚgapsÚutf8Nc	                 óB   • [         R                  XX'5        XEXh4U l        g)zz
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
N)ÚCorpusReaderÚ__init__Ú_cv_args)	ÚselfÚrootÚfileidsÚ	extensionÚstr2chunktreeÚsent_tokenizerÚpara_block_readerÚencodingÚtagsets	            Ú</usr/lib/python3/dist-packages/nltk/corpus/reader/chunked.pyr   ÚChunkedCorpusReader.__init__&   s'   € ô 	×Ñ˜d¨'Ô<Ø&Ð8IÐRˆŒð	Aó    c                 óž   • [        U R                  US5       VVs/ s H  u  p#[        X#SSSS/U R                  Q76 PM!     snn5      $ s  snnf )z^
:return: the given file(s) as a list of words
    and punctuation symbols.
:rtype: list(str)
Tr   ©ÚconcatÚabspathsÚChunkedCorpusViewr   ©r   r   ÚfÚencs       r   ÚwordsÚChunkedCorpusReader.words:   sV   € ô ð !%§¡¨g°tÔ <ôâ <‘HQô " !¨!¨Q°°1ÐE°t·}±}ÕEÙ <òó
ð 	
ùóó   ›&A	
c                 óž   • [        U R                  US5       VVs/ s H  u  p#[        X#SSSS/U R                  Q76 PM!     snn5      $ s  snnf )zŠ
:return: the given file(s) as a list of
    sentences or utterances, each encoded as a list of word
    strings.
:rtype: list(list(str))
Tr   é   r   r#   s       r   ÚsentsÚChunkedCorpusReader.sentsG   óV   € ô ð !%§¡¨g°tÔ <ôâ <‘HQô " !¨!¨Q°°1ÐE°t·}±}ÕEÙ <òó
ð 	
ùór(   c                 óž   • [        U R                  US5       VVs/ s H  u  p#[        X#SSSS/U R                  Q76 PM!     snn5      $ s  snnf )z´
:return: the given file(s) as a list of
    paragraphs, each encoded as a list of sentences, which are
    in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
Tr   r*   r   r#   s       r   ÚparasÚChunkedCorpusReader.parasU   r-   r(   c                 ó¤   • [        U R                  US5       VVs/ s H"  u  p4[        X4SSSS/U R                  Q7SU06PM$     snn5      $ s  snnf )z–
:return: the given file(s) as a list of tagged
    words and punctuation symbols, encoded as tuples
    ``(word,tag)``.
:rtype: list(tuple(str,str))
Tr*   r   Útarget_tagsetr   ©r   r   r   r$   r%   s        r   Útagged_wordsÚ ChunkedCorpusReader.tagged_wordsc   óh   € ô ð
 !%§¡¨g°tÔ <ô	ò !=‘HQô "Ø˜A˜q ! QðØ)-¯©òØFLôñ !=ò	ó
ð 	
ùóó   ›)A
c                 ó¤   • [        U R                  US5       VVs/ s H"  u  p4[        X4SSSS/U R                  Q7SU06PM$     snn5      $ s  snnf )z
:return: the given file(s) as a list of
    sentences, each encoded as a list of ``(word,tag)`` tuples.

:rtype: list(list(tuple(str,str)))
Tr*   r   r2   r   r3   s        r   Útagged_sentsÚ ChunkedCorpusReader.tagged_sentss   r6   r7   c                 ó¤   • [        U R                  US5       VVs/ s H"  u  p4[        X4SSSS/U R                  Q7SU06PM$     snn5      $ s  snnf )zÈ
:return: the given file(s) as a list of
    paragraphs, each encoded as a list of sentences, which are
    in turn encoded as lists of ``(word,tag)`` tuples.
:rtype: list(list(list(tuple(str,str))))
Tr*   r   r2   r   r3   s        r   Útagged_parasÚ ChunkedCorpusReader.tagged_parasƒ   r6   r7   c                 ó¤   • [        U R                  US5       VVs/ s H"  u  p4[        X4SSSS/U R                  Q7SU06PM$     snn5      $ s  snnf )a>  
:return: the given file(s) as a list of tagged
    words and chunks.  Words are encoded as ``(word, tag)``
    tuples (if the corpus has tags) or word strings (if the
    corpus has no tags).  Chunks are encoded as depth-one
    trees over ``(word,tag)`` tuples or word strings.
:rtype: list(tuple(str,str) and Tree)
Tr*   r   r2   r   r3   s        r   Úchunked_wordsÚ!ChunkedCorpusReader.chunked_words“   óh   € ô ð
 !%§¡¨g°tÔ <ô	ò !=‘HQô "Ø˜A˜q ! QðØ)-¯©òØFLôñ !=ò	ó
ð 	
ùór7   c                 ó¤   • [        U R                  US5       VVs/ s H"  u  p4[        X4SSSS/U R                  Q7SU06PM$     snn5      $ s  snnf )zþ
:return: the given file(s) as a list of
    sentences, each encoded as a shallow Tree.  The leaves
    of these trees are encoded as ``(word, tag)`` tuples (if
    the corpus has tags) or word strings (if the corpus has no
    tags).
:rtype: list(Tree)
Tr*   r   r2   r   r3   s        r   Úchunked_sentsÚ!ChunkedCorpusReader.chunked_sents¥   rA   r7   c                 ó¤   • [        U R                  US5       VVs/ s H"  u  p4[        X4SSSS/U R                  Q7SU06PM$     snn5      $ s  snnf )a7  
:return: the given file(s) as a list of
    paragraphs, each encoded as a list of sentences, which are
    in turn encoded as a shallow Tree.  The leaves of these
    trees are encoded as ``(word, tag)`` tuples (if the corpus
    has tags) or word strings (if the corpus has no tags).
:rtype: list(list(Tree))
Tr*   r2   r   r3   s        r   Úchunked_parasÚ!ChunkedCorpusReader.chunked_paras·   rA   r7   c                 óV   • [        U5       Vs/ s H  n[        U5      PM     sn$ s  snf ©N)Úread_blankline_blockr   )r   ÚstreamÚts      r   Ú_read_blockÚChunkedCorpusReader._read_blockÉ   s%   € Ü(<¸VÔ(DÓEÒ(D 1”˜A–Ñ(DÑEÐEùÒEs   Ž&)r   rI   ©NN)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__r   ÚRegexpTokenizerrJ   r   r&   r+   r/   r4   r9   r<   r?   rC   rF   rM   Ú__static_attributes__© r   r   r   r      s[   † ñ	ð Ø!Ù& t°$Ñ7Ø.ØØôAô(
ô
ô
ô
ô 
ô 
ô 
ô$
ô$
õ$Fr   r   c                   ó.   • \ rS rSr  SS jrS rS rSrg)r"   éÍ   Nc                 ó˜   • [         R                  XUS9  X0l        X@l        XPl        X`l        Xpl        X€l        Xl        X l	        X°l
        g )N)r   )ÚStreamBackedCorpusViewr   Ú_taggedÚ_group_by_sentÚ_group_by_paraÚ_chunkedÚ_str2chunktreeÚ_sent_tokenizerÚ_para_block_readerÚ_source_tagsetÚ_target_tagset)r   Úfileidr   ÚtaggedÚgroup_by_sentÚgroup_by_paraÚchunkedr   r   r   Úsource_tagsetr2   s               r   r   ÚChunkedCorpusView.__init__Î   sK   € ô 	×'Ñ'¨¸xÐ'ÑHØŒØ+ÔØ+ÔØŒØ+ÔØ-ÔØ"3ÔØ+ÔØ+Õr   c                 ó&  • / nU R                  U5       Hù  n/ nU R                  R                  U5       H   nU R                  UU R                  U R
                  S9nU R                  (       d  U R                  U5      nU R                  (       d  UR                  5       nU R                  (       a  UR                  U5        M  UR                  U5        M¢     U R                  (       a  UR                  U5        Mè  UR                  U5        Mû     U$ )N)rj   r2   )rb   ra   Útokenizer`   rc   rd   r\   Ú_untagr_   Úleavesr]   ÚappendÚextendr^   )r   rK   ÚblockÚpara_strÚparaÚsent_strÚsents          r   Ú
read_blockÚChunkedCorpusView.read_blockç   sß   € ØˆØ×/Ñ/°Ö7ˆHØˆDØ ×0Ñ0×9Ñ9¸(ÖCØ×*Ñ*ØØ"&×"5Ñ"5Ø"&×"5Ñ"5ð +ð ð —|—|ØŸ;™; tÓ,Dð —}—}ØŸ;™;›=Dð ×&×&Ø—K‘K Ö%à—K‘K Ö%ñ' Dð, ×"×"Ø—‘˜TÖ"à—‘˜TÖ"ñ7 8ð< ˆr   c                 óÌ   • [        U5       HT  u  p#[        U[        5      (       a  U R                  U5        M-  [        U[        5      (       a	  US   X'   MK  [        S5      e   U$ )Nr   z"expected child to be Tree or tuple)Ú	enumerateÚ
isinstancer   rn   ÚtupleÚ
ValueError)r   ÚtreeÚiÚchilds       r   rn   ÚChunkedCorpusView._untag	  sV   € Ü! $ž‰HˆAÜ˜%¤×&Ñ&Ø—‘˜EÖ"Ü˜E¤5×)Ñ)Ø ™(“ä Ð!EÓFÐFñ (ð ˆr   )	r_   r^   r]   rb   ra   rc   r`   r\   rd   rO   )rP   rQ   rR   rS   r   rw   rn   rV   rW   r   r   r"   r"   Í   s   † ð Øô,ò2 õDr   r"   )rT   ÚcodecsÚos.pathÚosÚnltkÚ
nltk.chunkr   Únltk.corpus.reader.apiÚ nltk.corpus.reader.bracket_parser   Únltk.corpus.reader.utilÚnltk.tokenizeÚ	nltk.treer   r   r   r[   r"   rW   r   r   Ú<module>rŒ      sG   ðñó
 Û ã Ý "Ü $Ý EÜ %Ü Ý ôpF˜,ô pFôfDÐ.õ Dr   