
    Of                     <    S r SSKrSSK7  SSK7   " S S\\5      rg)a  
CorpusReader for the Pros and Cons dataset.

- Pros and Cons dataset information -

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

Related papers:

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.

- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
    Opinions on the Web". Proceedings of the 14th international World Wide Web
    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
    N)*c                   T    \ rS rSrSr\r\" 5       S4S jrSS jr	SS jr
S rS	 rS
rg)ProsConsCorpusReader"   a  
Reader for the Pros and Cons sentence dataset.

    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
utf8c                 f    [         R                  XX$5        [        R                  X5        X0l        g)a}  
:param root: The root directory for the corpus.
:param fileids: a list or regexp specifying the fileids in the corpus.
:param word_tokenizer: a tokenizer for breaking sentences or paragraphs
    into words. Default: `WhitespaceTokenizer`
:param encoding: the encoding that should be used to read the corpus.
:param kwargs: additional parameters passed to CategorizedCorpusReader.
N)CorpusReader__init__CategorizedCorpusReader_word_tokenizer)selfrootfileidsword_tokenizerencodingkwargss         >/usr/lib/python3/dist-packages/nltk/corpus/reader/pros_cons.pyr
   ProsConsCorpusReader.__init__1   s(    " 	d'<((6-    Nc                    U R                  X5      nUc  U R                  nO[        U[        5      (       a  U/n[	        U R                  USS5       VVVs/ s H   u  p4nU R                  X0R                  US9PM"     snnn5      $ s  snnnf )a  
Return all sentences in the corpus or in the specified files/categories.

:param fileids: a list or regexp specifying the ids of the files whose
    sentences have to be returned.
:param categories: a list specifying the categories whose sentences
    have to be returned.
:return: the given file(s) as a list of sentences. Each sentence is
    tokenized using the specified word_tokenizer.
:rtype: list(list(str))
Tr   )_resolve_fileids
isinstancestrconcatabspaths
CorpusView_read_sent_blockr   r   
categoriespathencfileids         r   sentsProsConsCorpusReader.sentsF        --4?mmG%%iG ,0==$+M+M'T &;&;cJ+M
 	
   'Bc                    U R                  X5      nUc  U R                  nO[        U[        5      (       a  U/n[	        U R                  USS5       VVVs/ s H   u  p4nU R                  X0R                  US9PM"     snnn5      $ s  snnnf )aw  
Return all words and punctuation symbols in the corpus or in the specified
files/categories.

:param fileids: a list or regexp specifying the ids of the files whose
    words have to be returned.
:param categories: a list specifying the categories whose words have
    to be returned.
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
Tr   )r   r   r   r   r   r   r   _read_word_blockr    s         r   wordsProsConsCorpusReader.words^   r'   r(   c                 .   / n[        S5       H  nUR                  5       nU(       d  M  [        R                  " SU5      nU(       d  M<  UR	                  U R
                  R                  UR                  S5      R                  5       5      5        M     U$ )N   z+^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>   )	rangereadlinerematchappendr   tokenizegroupstrip)r   streamr%   ilinesents         r   r   %ProsConsCorpusReader._read_sent_blockv   sr    rA??$D88JDQDtT11::4::a=;N;N;PQR  r   c                 \    / nU R                  U5       H  nUR                  U5        M     U$ )N)r   extend)r   r8   r+   r;   s       r   r*   %ProsConsCorpusReader._read_word_block   s-    ))&1DLL 2r   )r   )NN)__name__
__module____qualname____firstlineno____doc__StreamBackedCorpusViewr   WordPunctTokenizerr
   r%   r+   r   r*   __static_attributes__ r   r   r   r   "   s2    
 (J *+.*
0
0	r   r   )rD   r2   nltk.corpus.reader.apinltk.tokenizer   r	   r   rH   r   r   <module>rK      s&   ( 
 $ c2L cr   