
    OfA                          S r SSKrSSK7  SSK7  SSK7   " S S\5      r " S S\\5      r	 " S S	\	5      r
 " S
 S\5      rg)z;
A reader for corpora that consist of plaintext documents.
    N)*c                   j    \ rS rSrSr\r \" 5       S\S4S jr	SS jr
SS jrSS jrS	 rS
 rS rSrg)PlaintextCorpusReader   a  
Reader for corpora that consist of plaintext documents.  Paragraphs
are assumed to be split using blank lines.  Sentences and words can
be tokenized using the default tokenizers, or by custom tokenizers
specified as parameters to the constructor.

This corpus reader can be customized (e.g., to skip preface
sections of specific document formats) by creating a subclass and
overriding the ``CorpusView`` class variable.
Nutf8c                 T    [         R                  XX&5        X0l        X@l        XPl        g)au  
Construct a new plaintext corpus reader for a set of documents
located at the given root directory.  Example usage:

    >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
    >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
:param word_tokenizer: Tokenizer for breaking sentences or
    paragraphs into words.
:param sent_tokenizer: Tokenizer for breaking paragraphs
    into words.
:param para_block_reader: The block reader used to divide the
    corpus into paragraph blocks.
N)CorpusReader__init___word_tokenizer_sent_tokenizer_para_block_reader)selfrootfileidsword_tokenizersent_tokenizerpara_block_readerencodings          >/usr/lib/python3/dist-packages/nltk/corpus/reader/plaintext.pyr
   PlaintextCorpusReader.__init__%   s&    2 	d'<--"3    c                     [        U R                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ s  snnnf )z^
:return: the given file(s) as a list of words
    and punctuation symbols.
:rtype: list(str)
Tr   )concatabspaths
CorpusView_read_word_blockr   r   pathencfileids        r   wordsPlaintextCorpusReader.wordsC   sX      ,0==$+M+M'T &;&;cJ+M
 	
s   'Ac                    U R                   c   [        5       U l         [        U R	                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ !   [        S5      e= fs  snnnf )z
:return: the given file(s) as a list of
    sentences or utterances, each encoded as a list of word
    strings.
:rtype: list(list(str))
%No sentence tokenizer for this corpusTr   )r   PunktTokenizer
ValueErrorr   r   r   _read_sent_blockr   s        r   sentsPlaintextCorpusReader.sentsP        'J'5'7$  ,0==$+M+M'T &;&;cJ+M
 	
J !HII   A* 'A:*A7c                    U R                   c   [        5       U l         [        U R	                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ !   [        S5      e= fs  snnnf )z
:return: the given file(s) as a list of
    paragraphs, each encoded as a list of sentences, which are
    in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
r%   Tr   )r   r&   r'   r   r   r   _read_para_blockr   s        r   parasPlaintextCorpusReader.parasd   r+   r,   c                     / n[        S5       H;  nUR                  U R                  R                  UR	                  5       5      5        M=     U$ Nr   )rangeextendr   tokenizereadliner   streamr"   is       r   r   &PlaintextCorpusReader._read_word_blockx   s>    rALL--66v7HIJ r   c           	          / nU R                  U5       HW  nUR                  U R                  R                  U5       Vs/ s H  nU R                  R                  U5      PM      sn5        MY     U$ s  snf N)r   r4   r   r5   r   r   r8   r)   parasents        r   r(   &PlaintextCorpusReader._read_sent_block~   s    ++F3DLL !% 4 4 = =d C C ((11$7 C 4    %A1
c           	          / nU R                  U5       HW  nUR                  U R                  R                  U5       Vs/ s H  nU R                  R                  U5      PM      sn5        MY     U$ s  snf r<   )r   appendr   r5   r   r   r8   r/   r>   r?   s        r   r.   &PlaintextCorpusReader._read_para_block   rA   rB   )r   r   r   r<   )__name__
__module____qualname____firstlineno____doc__StreamBackedCorpusViewr   WordPunctTokenizerread_blankline_blockr
   r"   r)   r/   r   r(   r.   __static_attributes__ r   r   r   r      sH    	 (JE *+.4<

(
(		r   r   c                       \ rS rSrSrS rSrg) CategorizedPlaintextCorpusReader   zm
A reader for plaintext corpora whose documents are divided into
categories based on their file identifiers.
c                 b    [         R                  X5        [        R                  " U /UQ70 UD6  g)z
Initialize the corpus reader.  Categorization arguments
(``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
the ``CategorizedCorpusReader`` constructor.  The remaining arguments
are passed to the ``PlaintextCorpusReader`` constructor.
N)CategorizedCorpusReaderr
   r   r   argskwargss      r   r
   )CategorizedPlaintextCorpusReader.__init__   s*     	 ((6&&t=d=f=r   rP   N)rG   rH   rI   rJ   rK   r
   rO   rP   r   r   rR   rR      s    
>r   rR   c                       \ rS rSrS rSrg)*PortugueseCategorizedPlaintextCorpusReader   c                 J    [         R                  X5        [        S5      US'   g )N
portugueser   )rU   r
   r&   rV   s      r   r
   3PortugueseCategorizedPlaintextCorpusReader.__init__   s     ((6#1,#? r   rP   N)rG   rH   rI   rJ   r
   rO   rP   r   r   r[   r[      s    @r   r[   c                   >    \ rS rSrSrS rS rS rS
S jrS
S jr	S	r
g)EuroparlCorpusReader   a  
Reader for Europarl corpora that consist of plaintext documents.
Documents are divided into chapters instead of paragraphs as
for regular plaintext documents. Chapters are separated using blank
lines. Everything is inherited from ``PlaintextCorpusReader`` except
that:

- Since the corpus is pre-processed and pre-tokenized, the
  word tokenizer should just split the line at whitespaces.
- For the same reason, the sentence tokenizer should just
  split the paragraph at line breaks.
- There is a new 'chapters()' method that returns chapters instead
  instead of paragraphs.
- The 'paras()' method inherited from PlaintextCorpusReader is
  made non-functional to remove any confusion between chapters
  and paragraphs for Europarl.
c                     / n[        S5       H0  nUR                  UR                  5       R                  5       5        M2     U$ r2   )r3   r4   r6   splitr7   s       r   r   %EuroparlCorpusReader._read_word_block   s6    rALL*0023 r   c                     / nU R                  U5       H@  nUR                  UR                  5        Vs/ s H  oDR                  5       PM     sn5        MB     U$ s  snf r<   )r   r4   
splitlinesrd   r=   s        r   r(   %EuroparlCorpusReader._read_sent_block   O    ++F3DLL4??3DE3D4**,3DEF 4 F   A
c                     / nU R                  U5       H@  nUR                  UR                  5        Vs/ s H  oDR                  5       PM     sn5        MB     U$ s  snf r<   )r   rD   rg   rd   rE   s        r   r.   %EuroparlCorpusReader._read_para_block   ri   rj   Nc                     [        U R                  US5       VVs/ s H  u  p#U R                  X R                  US9PM!     snn5      $ s  snnf )z
:return: the given file(s) as a list of
    chapters, each encoded as a list of sentences, which are
    in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
Tr   )r   r   r   r.   )r   r   r!   r    s       r   chaptersEuroparlCorpusReader.chapters   sS      &*]]7D%A%AMV (=(=L%A
 	
s   &A	
c                     [        S5      e)NzVThe Europarl corpus reader does not support paragraphs. Please use chapters() instead.)NotImplementedError)r   r   s     r   r/   EuroparlCorpusReader.paras   s    !d
 	
r   rP   r<   )rG   rH   rI   rJ   rK   r   r(   r.   rn   r/   rO   rP   r   r   ra   ra      s     $

r   ra   )rK   	nltk.datanltknltk.corpus.reader.apinltk.corpus.reader.utilnltk.tokenizer	   r   rU   rR   r[   ra   rP   r   r   <module>rx      sS     $ % ~L ~B>'>@U >(@1Q @6
0 6
r   